From 7e0528dadc9f8b04e4de0dba48a075100c2afe75 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:44 -0500 Subject: slub: Push irq disable into allocate_slab() Do the irq handling in allocate_slab() instead of __slab_alloc(). __slab_alloc() is already cluttered and allocate_slab() is already fiddling around with gfp flags. v6->v7: Only increment ORDER_FALLBACK if we get a page during fallback Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 35f351f26193..add2ae74046c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1187,6 +1187,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1203,12 +1208,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1849,15 +1859,8 @@ new_slab: goto load_freelist; } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - page = new_slab(s, gfpflags, node); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - if (page) { c = __this_cpu_ptr(s->cpu_slab); stat(s, ALLOC_SLAB); -- cgit v1.2.3 From 50d5c41cd151b21ac1dfc98f048210456ccacc20 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:45 -0500 Subject: slub: Do not use frozen page flag but a bit in the page counters Do not use a page flag for the frozen 
bit. It needs to be part of the state that is handled with cmpxchg_double(). So use a bit in the counter struct in the page struct for that purpose. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/mm_types.h | 5 +++-- include/linux/page-flags.h | 5 ----- mm/slub.c | 12 ++++++------ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 027935c86c68..e5fb2a70518b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -41,8 +41,9 @@ struct page { * & limit reverse map searches. */ struct { /* SLUB */ - u16 inuse; - u16 objects; + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; }; }; union { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6081493db68f..20791f18f5cf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -124,9 +124,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - - /* SLUB */ - PG_slub_frozen = PG_active, }; #ifndef __GENERATING_BOUNDS_H @@ -212,8 +209,6 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) -__PAGEFLAG(SlubFrozen, slub_frozen) - /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. 
diff --git a/mm/slub.c b/mm/slub.c index add2ae74046c..82b2d048a278 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -166,7 +166,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ @@ -1025,7 +1025,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) + if (!page->frozen && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -1424,7 +1424,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, { if (slab_trylock(page)) { __remove_partial(n, page); - __SetPageSlubFrozen(page); + page->frozen = 1; return 1; } return 0; @@ -1538,7 +1538,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - __ClearPageSlubFrozen(page); + page->frozen = 0; if (page->inuse) { if (page->freelist) { @@ -1868,7 +1868,7 @@ new_slab: flush_slab(s, c); slab_lock(page); - __SetPageSlubFrozen(page); + page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2048,7 +2048,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, page->freelist = object; page->inuse--; - if (unlikely(PageSlubFrozen(page))) { + if (unlikely(page->frozen)) { stat(s, FREE_FROZEN); goto out_unlock; } -- cgit v1.2.3 From 8cb0a5068f4108e8ca60d5e0bcfbe6901adcfaef Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:46 -0500 Subject: slub: Move page->frozen handling near where the page->freelist handling occurs This is necessary because the frozen bit has to be handled in the same cmpxchg_double with the 
freelist and the counters. Signed-off-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 82b2d048a278..5a2d3d8e0558 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1286,6 +1286,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = 0; + page->frozen = 1; out: return page; } @@ -1424,7 +1425,6 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, { if (slab_trylock(page)) { __remove_partial(n, page); - page->frozen = 1; return 1; } return 0; @@ -1538,7 +1538,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - page->frozen = 0; if (page->inuse) { if (page->freelist) { @@ -1671,6 +1670,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) } c->page = NULL; c->tid = next_tid(c->tid); + page->frozen = 0; unfreeze_slab(s, page, tail); } @@ -1831,6 +1831,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: + VM_BUG_ON(!page->frozen); + object = page->freelist; if (unlikely(!object)) goto another_slab; @@ -1854,6 +1856,7 @@ new_slab: page = get_partial(s, gfpflags, node); if (page) { stat(s, ALLOC_FROM_PARTIAL); + page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2371,6 +2374,7 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); -- cgit v1.2.3 From b789ef518b2a7231b0668c813f677cee528a9d3f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:49 -0500 Subject: slub: Add cmpxchg_double_slab() Add a 
function that operates on the second doubleword in the page struct and manipulates the object counters, the freelist and the frozen attribute. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 65 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 61 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index c8668d161dd8..b42715294147 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -33,6 +33,7 @@ enum stat_item { DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ NR_SLUB_STAT_ITEMS }; struct kmem_cache_cpu { diff --git a/mm/slub.c b/mm/slub.c index 5a2d3d8e0558..be6715dd0ee8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. 
@@ -170,6 +173,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ static int kmem_size = sizeof(struct kmem_cache); @@ -338,6 +342,37 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + return 1; + } + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* * Determine a map of object in use on a page. @@ -2596,6 +2631,12 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#ifdef CONFIG_CMPXCHG_DOUBLE + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. 
@@ -4248,8 +4289,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4263,8 +4306,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4281,8 +4326,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4300,8 +4347,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4319,8 +4368,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4493,6 +4544,8 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); #endif static struct attribute *slab_attrs[] = { @@ -4550,6 +4603,8 @@ static struct attribute *slab_attrs[] = { &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, #endif 
#ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- cgit v1.2.3 From 5cc6eee8a8c1aefe9c86fe7345a2aa1c4ca70dc6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:50 -0500 Subject: slub: explicit list_lock taking The allocator fastpath rework does change the usage of the list_lock. Remove the list_lock processing from the functions that hide them from the critical sections and move them into those critical sections. This in turn simplifies the support functions (no __ variant needed anymore) and simplifies the lock handling on bootstrap. Inline add_partial since it becomes pretty simple. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 89 +++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index be6715dd0ee8..e39be0928a22 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -916,26 +916,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. + * + * list_lock must be held. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } +/* + * list_lock must be held. 
+ */ static void remove_full(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -1060,8 +1061,13 @@ static noinline int free_debug_processing(struct kmem_cache *s, } /* Special debug activities for freeing objects */ - if (!page->frozen && !page->freelist) + if (!page->frozen && !page->freelist) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); remove_full(s, page); + spin_unlock(&n->list_lock); + } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); @@ -1170,7 +1176,8 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1420,38 +1427,33 @@ static __always_inline int slab_trylock(struct page *page) } /* - * Management of partially allocated slabs + * Management of partially allocated slabs. + * + * list_lock must be held. */ -static void add_partial(struct kmem_cache_node *n, +static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; if (tail) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, +/* + * list_lock must be held. 
+ */ +static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - __remove_partial(n, page); - spin_unlock(&n->list_lock); -} - /* - * Lock slab and remove from the partial list. + * Lock slab, remove from the partial list and put the object into the + * per cpu freelist. * * Must hold list_lock. */ @@ -1459,7 +1461,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - __remove_partial(n, page); + remove_partial(n, page); return 1; } return 0; @@ -1576,12 +1578,17 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) if (page->inuse) { if (page->freelist) { + spin_lock(&n->list_lock); add_partial(n, page, tail); + spin_unlock(&n->list_lock); stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) { + spin_lock(&n->list_lock); + add_full(s, n, page); + spin_unlock(&n->list_lock); + } } slab_unlock(page); } else { @@ -1597,7 +1604,9 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * kmem_cache_shrink can reclaim any empty slabs from * the partial list. */ + spin_lock(&n->list_lock); add_partial(n, page, 1); + spin_unlock(&n->list_lock); slab_unlock(page); } else { slab_unlock(page); @@ -2099,7 +2108,11 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. 
*/ if (unlikely(!prior)) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); add_partial(get_node(s, page_to_nid(page)), page, 1); + spin_unlock(&n->list_lock); stat(s, FREE_ADD_PARTIAL); } @@ -2113,7 +2126,11 @@ slab_empty: /* * Slab still on the partial list. */ - remove_partial(s, page); + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + remove_partial(n, page); + spin_unlock(&n->list_lock); stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); @@ -2391,7 +2408,6 @@ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2418,14 +2434,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. - */ - local_irq_save(flags); add_partial(n, page, 0); - local_irq_restore(flags); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2709,7 +2718,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -3047,7 +3056,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. 
*/ - __remove_partial(n, page); + remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { -- cgit v1.2.3 From 61728d1efc927eccfa64c50ede4998a8765805c3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:51 -0500 Subject: slub: Pass kmem_cache struct to lock and freeze slab We need more information about the slab for the cmpxchg implementation. Signed-off-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Pekka Enberg --- mm/slub.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e39be0928a22..5cf98ff09360 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1457,8 +1457,8 @@ static inline void remove_partial(struct kmem_cache_node *n, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline int lock_and_freeze_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { remove_partial(n, page); @@ -1470,7 +1470,8 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, /* * Try to allocate a partial slab from a specific node. 
*/ -static struct page *get_partial_node(struct kmem_cache_node *n) +static struct page *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n) { struct page *page; @@ -1485,7 +1486,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) + if (lock_and_freeze_slab(s, n, page)) goto out; page = NULL; out: @@ -1536,7 +1537,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(n); + page = get_partial_node(s, n); if (page) { put_mems_allowed(); return page; @@ -1556,7 +1557,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) struct page *page; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(get_node(s, searchnode)); + page = get_partial_node(s, get_node(s, searchnode)); if (page || node != NUMA_NO_NODE) return page; @@ -2081,7 +2082,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; - unsigned long flags; + unsigned long uninitialized_var(flags); local_irq_save(flags); slab_lock(page); -- cgit v1.2.3 From 2cfb7455d223ab24b23df44be430faf92e12390f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:52 -0500 Subject: slub: Rework allocator fastpaths Rework the allocation paths so that updates of the page freelist, frozen state and number of objects use cmpxchg_double_slab(). 
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 409 ++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 280 insertions(+), 129 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5cf98ff09360..5f0346c97c5f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -992,11 +992,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -1060,14 +1055,6 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!page->frozen && !page->freelist) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - remove_full(s, page); - spin_unlock(&n->list_lock); - } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); @@ -1178,6 +1165,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1460,11 +1448,52 @@ static inline void remove_partial(struct kmem_cache_node *n, static inline int lock_and_freeze_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { - if (slab_trylock(page)) { - remove_partial(n, page); + void *freelist; + unsigned long counters; + struct page new; + + + if (!slab_trylock(page)) + return 0; + + /* + * Zap the freelist and set the frozen bit. 
+ * The old freelist is the list of objects for the + * per cpu allocation list. + */ + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + } while (!cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "lock and freeze")); + + remove_partial(n, page); + + if (freelist) { + /* Populate the per cpu freelist */ + this_cpu_write(s->cpu_slab->freelist, freelist); + this_cpu_write(s->cpu_slab->page, page); + this_cpu_write(s->cpu_slab->node, page_to_nid(page)); return 1; + } else { + /* + * Slab page came from the wrong list. No object to allocate + * from. Put it onto the correct list and continue partial + * scan. + */ + printk(KERN_ERR "SLUB: %s : Page without available objects on" + " partial list\n", s->name); + slab_unlock(page); + return 0; } - return 0; } /* @@ -1564,59 +1593,6 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) return get_any_partial(s, flags); } -/* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. - */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - if (page->inuse) { - - if (page->freelist) { - spin_lock(&n->list_lock); - add_partial(n, page, tail); - spin_unlock(&n->list_lock); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) { - spin_lock(&n->list_lock); - add_full(s, n, page); - spin_unlock(&n->list_lock); - } - } - slab_unlock(page); - } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. 
This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. - */ - spin_lock(&n->list_lock); - add_partial(n, page, 1); - spin_unlock(&n->list_lock); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); - } - } -} - #ifdef CONFIG_PREEMPT /* * Calculate the next globally unique transaction for disambiguiation @@ -1683,40 +1659,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } +/* + * Remove the cpu slab + */ + /* * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct page *page = c->page; - int tail = 1; - - if (page->freelist) + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *freelist; + void *nextfree; + int tail = 0; + struct page new; + struct page old; + + if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); + tail = 1; + } + + c->tid = next_tid(c->tid); + c->page = NULL; + freelist = c->freelist; + c->freelist = NULL; + /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. 
*/ - while (unlikely(c->freelist)) { - void **object; + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } - tail = 0; /* Hot objects. Put the slab first */ + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. 
+ */ +redo: - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial < s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); + } + } else { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail ? 
DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + slab_unlock(page); + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); } - c->page = NULL; - c->tid = next_tid(c->tid); - page->frozen = 0; - unfreeze_slab(s, page, tail); } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -1851,6 +1948,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void **object; struct page *page; unsigned long flags; + struct page new; + unsigned long counters; local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -1873,25 +1972,33 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; - stat(s, ALLOC_REFILL); + stat(s, ALLOC_SLOWPATH); + + do { + object = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + VM_BUG_ON(!new.frozen); + + } while (!cmpxchg_double_slab(s, page, + object, counters, + NULL, new.counters, + "__slab_alloc")); load_freelist: VM_BUG_ON(!page->frozen); - object = page->freelist; if (unlikely(!object)) goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - c->freelist = get_freepointer(s, object); - page->inuse = page->objects; - page->freelist = NULL; + stat(s, ALLOC_REFILL); slab_unlock(page); + + c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); - stat(s, ALLOC_SLOWPATH); return object; another_slab: @@ -1901,9 +2008,10 @@ new_slab: page = get_partial(s, gfpflags, node); if (page) { stat(s, ALLOC_FROM_PARTIAL); - page->frozen = 1; - c->node = page_to_nid(page); - c->page = page; + object = c->freelist; + + if 
(kmem_cache_debug(s)) + goto debug; goto load_freelist; } @@ -1911,12 +2019,19 @@ new_slab: if (page) { c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + page->inuse = page->objects; + + stat(s, ALLOC_SLAB); slab_lock(page); - page->frozen = 1; c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -1925,12 +2040,12 @@ new_slab: slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; + debug: - if (!alloc_debug_processing(s, page, object, addr)) - goto another_slab; + if (!object || !alloc_debug_processing(s, page, object, addr)) + goto new_slab; - page->inuse++; - page->freelist = get_freepointer(s, object); + c->freelist = get_freepointer(s, object); deactivate_slab(s, c); c->page = NULL; c->node = NUMA_NO_NODE; @@ -2082,6 +2197,11 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; + int was_frozen; + int inuse; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; unsigned long uninitialized_var(flags); local_irq_save(flags); @@ -2091,32 +2211,65 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) goto out_unlock; - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen && !n) { + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. 
+ * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock(&n->list_lock); + } + inuse = new.inuse; - if (unlikely(page->frozen)) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); - if (unlikely(!page->inuse)) - goto slab_empty; + if (likely(!n)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + goto out_unlock; + } /* - * Objects left in the slab. If it was not on the partial list before - * then add it. + * was_frozen may have been set after we acquired the list_lock in + * an earlier loop. So we need to check it here again. */ - if (unlikely(!prior)) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + if (was_frozen) + stat(s, FREE_FROZEN); + else { + if (unlikely(!inuse && n->nr_partial > s->min_partial)) + goto slab_empty; - spin_lock(&n->list_lock); - add_partial(get_node(s, page_to_nid(page)), page, 1); - spin_unlock(&n->list_lock); - stat(s, FREE_ADD_PARTIAL); + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, 0); + stat(s, FREE_ADD_PARTIAL); + } } + spin_unlock(&n->list_lock); + out_unlock: slab_unlock(page); local_irq_restore(flags); @@ -2127,13 +2280,11 @@ slab_empty: /* * Slab still on the partial list. 
*/ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); remove_partial(n, page); - spin_unlock(&n->list_lock); stat(s, FREE_REMOVE_PARTIAL); } + + spin_unlock(&n->list_lock); slab_unlock(page); local_irq_restore(flags); stat(s, FREE_SLAB); -- cgit v1.2.3 From 881db7fb03a77af0bcd460fd1de1f4062d5c18fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:53 -0500 Subject: slub: Invert locking and avoid slab lock Locking slabs is no longer necessary if the arch supports cmpxchg operations and if no debugging features are used on a slab. If the arch does not support cmpxchg then we fall back to using the slab lock to do a cmpxchg like operation. The patch also changes the lock order. Slab locks are subsumed to the node lock now. With that approach slab_trylocking is no longer necessary. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 129 +++++++++++++++++++++++++------------------------------------- 1 file changed, 52 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5f0346c97c5f..ee70c091e577 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include @@ -32,15 +33,27 @@ /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slub_lock (Global Semaphore) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct.
If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * slub_lock + * + * The role of the slub_lock is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -53,20 +66,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. 
If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has no one operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -342,6 +341,19 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, @@ -356,11 +368,14 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, } else #endif { + slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; + slab_unlock(page); return 1; } + slab_unlock(page); } cpu_relax(); @@ -377,7 +392,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, /* * Determine a map of object in use on a page. * - * Slab lock or node listlock must be held to guarantee that the page does + * Node listlock must be held to guarantee that the page does * not vanish from under us. 
*/ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -808,10 +823,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -1024,6 +1040,8 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -1059,10 +1077,12 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); + slab_unlock(page); return 1; fail: slab_fix(s, "Object at 0x%p not freed", object); + slab_unlock(page); return 0; } @@ -1393,27 +1413,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page) free_slab(s, page); } -/* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - /* * Management of partially allocated slabs. * @@ -1445,17 +1444,13 @@ static inline void remove_partial(struct kmem_cache_node *n, * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache *s, +static inline int acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { void *freelist; unsigned long counters; struct page new; - - if (!slab_trylock(page)) - return 0; - /* * Zap the freelist and set the frozen bit. 
* The old freelist is the list of objects for the @@ -1491,7 +1486,6 @@ static inline int lock_and_freeze_slab(struct kmem_cache *s, */ printk(KERN_ERR "SLUB: %s : Page without available objects on" " partial list\n", s->name); - slab_unlock(page); return 0; } } @@ -1515,7 +1509,7 @@ static struct page *get_partial_node(struct kmem_cache *s, spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(s, n, page)) + if (acquire_slab(s, n, page)) goto out; page = NULL; out: @@ -1804,8 +1798,6 @@ redo: "unfreezing slab")) goto redo; - slab_unlock(page); - if (lock) spin_unlock(&n->list_lock); @@ -1819,7 +1811,6 @@ redo: static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); deactivate_slab(s, c); } @@ -1968,7 +1959,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!page) goto new_slab; - slab_lock(page); if (unlikely(!node_match(c, node))) goto another_slab; @@ -1994,8 +1984,6 @@ load_freelist: stat(s, ALLOC_REFILL); - slab_unlock(page); - c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); @@ -2031,7 +2019,6 @@ new_slab: page->inuse = page->objects; stat(s, ALLOC_SLAB); - slab_lock(page); c->node = page_to_nid(page); c->page = page; goto load_freelist; @@ -2205,7 +2192,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, unsigned long uninitialized_var(flags); local_irq_save(flags); - slab_lock(page); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) @@ -2271,7 +2257,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, spin_unlock(&n->list_lock); out_unlock: - slab_unlock(page); local_irq_restore(flags); return; @@ -2285,7 +2270,6 @@ slab_empty: } spin_unlock(&n->list_lock); - slab_unlock(page); local_irq_restore(flags); stat(s, FREE_SLAB); discard_slab(s, page); @@ -3202,14 +3186,8 @@ int 
kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ + if (!page->inuse) { remove_partial(n, page); - slab_unlock(page); discard_slab(s, page); } else { list_move(&page->lru, @@ -3797,12 +3775,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, -- cgit v1.2.3 From 5c2e4bbbd60623f1024a753c291b666068f8a6e7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:54 -0500 Subject: slub: Disable interrupts in free_debug processing We will be calling free_debug_processing with interrupts disabled in some cases when the later patches are applied. Some of the functions called by free_debug_processing expect interrupts to be off.
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ee70c091e577..08c57a047548 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1040,6 +1040,10 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + unsigned long flags; + int rc = 0; + + local_irq_save(flags); slab_lock(page); if (!check_slab(s, page)) @@ -1056,7 +1060,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; if (unlikely(s != page->slab)) { if (!PageSlab(page)) { @@ -1077,13 +1081,15 @@ static noinline int free_debug_processing(struct kmem_cache *s, set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); + rc = 1; +out: slab_unlock(page); - return 1; + local_irq_restore(flags); + return rc; fail: slab_fix(s, "Object at 0x%p not freed", object); - slab_unlock(page); - return 0; + goto out; } static int __init setup_slub_debug(char *str) -- cgit v1.2.3 From 80f08c191f6c9563641291bea80657a3b9faabf0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:55 -0500 Subject: slub: Avoid disabling interrupts in free slowpath Disabling interrupts can be avoided now. However, list operations still require disabling interrupts since allocations can occur from interrupt contexts and there is no way to perform atomic list operations. The acquisition of the list_lock therefore has to disable interrupts as well. Dropping interrupt handling significantly simplifies the slowpath.
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 08c57a047548..cb6b0857e1a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2197,11 +2197,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, struct kmem_cache_node *n = NULL; unsigned long uninitialized_var(flags); - local_irq_save(flags); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) - goto out_unlock; + return; do { prior = page->freelist; @@ -2220,7 +2219,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } inuse = new.inuse; @@ -2236,7 +2235,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (was_frozen) stat(s, FREE_FROZEN); - goto out_unlock; + return; } /* @@ -2259,11 +2258,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_ADD_PARTIAL); } } - - spin_unlock(&n->list_lock); - -out_unlock: - local_irq_restore(flags); + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -2275,8 +2270,7 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } - spin_unlock(&n->list_lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } -- cgit v1.2.3 From fc59c05306fe1dcfa3fb8ba34ed45407fba4689c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:56 -0500 Subject: slub: Get rid of the another_slab label We can avoid deactivate slab in special cases if we do the deactivation of slabs in each code flow that leads to new_slab. 
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index cb6b0857e1a6..41a15c1d8068 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1965,8 +1965,10 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!page) goto new_slab; - if (unlikely(!node_match(c, node))) - goto another_slab; + if (unlikely(!node_match(c, node))) { + deactivate_slab(s, c); + goto new_slab; + } stat(s, ALLOC_SLOWPATH); @@ -1986,7 +1988,7 @@ load_freelist: VM_BUG_ON(!page->frozen); if (unlikely(!object)) - goto another_slab; + goto new_slab; stat(s, ALLOC_REFILL); @@ -1995,9 +1997,6 @@ load_freelist: local_irq_restore(flags); return object; -another_slab: - deactivate_slab(s, c); - new_slab: page = get_partial(s, gfpflags, node); if (page) { -- cgit v1.2.3 From e36a2652d7d1ad97f7636a39bdd8654d296cc36b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:57 -0500 Subject: slub: Add statistics for the case that the current slab does not match the node Slub reloads the per cpu slab if the page does not satisfy the NUMA condition. Track those reloads since doing so has a performance impact. 
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index b42715294147..5b228b785377 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -24,6 +24,7 @@ enum stat_item { ALLOC_FROM_PARTIAL, /* Cpu slab acquired from partial list */ ALLOC_SLAB, /* Cpu slab acquired from page allocator */ ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ FREE_SLAB, /* Slab freed to the page allocator */ CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ diff --git a/mm/slub.c b/mm/slub.c index 41a15c1d8068..e00b7732f556 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1966,6 +1966,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto new_slab; if (unlikely(!node_match(c, node))) { + stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); goto new_slab; } @@ -4671,6 +4672,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4730,6 +4732,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, -- cgit v1.2.3 From 03e404af26dc2ea0d278d7a342de0aab394793ce Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:58 -0500 Subject: slub: fast release on full slab Make deactivation occur implicitly while checking out the current freelist. 
This avoids one cmpxchg operation on a slab that is now fully in use. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5b228b785377..71441f89729b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -32,6 +32,7 @@ enum stat_item { DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ diff --git a/mm/slub.c b/mm/slub.c index e00b7732f556..25dac48c1c60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1977,9 +1977,21 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, object = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; VM_BUG_ON(!new.frozen); + /* + * If there is no object left then we use this loop to + * deactivate the slab which is simple since no objects + * are left in the slab and therefore we do not need to + * put the page back onto the partial list. + * + * If there are objects left then we retrieve them + * and use them to refill the per cpu queue. 
+ */ + + new.inuse = page->objects; + new.frozen = object != NULL; + } while (!cmpxchg_double_slab(s, page, object, counters, NULL, new.counters, @@ -1988,8 +2000,11 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, load_freelist: VM_BUG_ON(!page->frozen); - if (unlikely(!object)) + if (unlikely(!object)) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); goto new_slab; + } stat(s, ALLOC_REFILL); @@ -4680,6 +4695,7 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); @@ -4740,6 +4756,7 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, -- cgit v1.2.3 From 4eade540fc35353813097bfdb39465c9b8847a15 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Jun 2011 12:25:59 -0500 Subject: slub: Not necessary to check for empty slab on load_freelist load_freelist is now branched to only if there are objects available. So there is no need to check the object variable for NULL.
Signed-off-by: Pekka Enberg --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 25dac48c1c60..78c488202f7d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1997,9 +1997,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, NULL, new.counters, "__slab_alloc")); -load_freelist: - VM_BUG_ON(!page->frozen); - if (unlikely(!object)) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); @@ -2008,6 +2005,8 @@ load_freelist: stat(s, ALLOC_REFILL); +load_freelist: + VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); -- cgit v1.2.3 From 1d07171c5e58e68a76a141970a3a5e816a414ce6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jul 2011 12:49:12 -0500 Subject: slub: disable interrupts in cmpxchg_double_slab when falling back to pagelock Split cmpxchg_double_slab into two functions. One for the case where we know that interrupts are disabled (and therefore the fallback does not need to disable interrupts) and one for the other cases where fallback will also disable interrupts. This fixes the issue that __slab_free called cmpxchg_double_slab in some scenarios without disabling interrupts. 
Tested-by: Hugh Dickins Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 78c488202f7d..7836b45ea1fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -354,6 +354,42 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, @@ -368,14 +404,19 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, } else #endif { + unsigned long flags; + + local_irq_save(flags); slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; slab_unlock(page); + local_irq_restore(flags); return 1; } slab_unlock(page); + local_irq_restore(flags); } cpu_relax(); @@ -1471,7 +1512,7 @@ static 
inline int acquire_slab(struct kmem_cache *s, VM_BUG_ON(new.frozen); new.frozen = 1; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, freelist, counters, NULL, new.counters, "lock and freeze")); @@ -1709,7 +1750,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) new.inuse--; VM_BUG_ON(!new.frozen); - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, prior, counters, freelist, new.counters, "drain percpu freelist")); @@ -1798,7 +1839,7 @@ redo: } l = m; - if (!cmpxchg_double_slab(s, page, + if (!__cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) @@ -1992,7 +2033,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, new.inuse = page->objects; new.frozen = object != NULL; - } while (!cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, page, object, counters, NULL, new.counters, "__slab_alloc")); -- cgit v1.2.3 From 9e577e8b46ab0c38970c0f0cd7eae62e6dffddee Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 22 Jul 2011 09:35:14 -0500 Subject: slub: When allocating a new slab also prep the first object We need to branch to the debug code for the first object if we allocate a new slab otherwise the first object will be marked wrongly as inactive. Tested-by: Rabin Vincent Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7836b45ea1fa..e842c19e67fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2082,6 +2082,9 @@ new_slab: stat(s, ALLOC_SLAB); c->node = page_to_nid(page); c->page = page; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) -- cgit v1.2.3