summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c386
1 files changed, 267 insertions, 119 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c469c1dfb8b..f8f3bfc435ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -522,12 +522,6 @@ static void bad_page(struct page *page, const char *reason,
static unsigned long nr_shown;
static unsigned long nr_unshown;
- /* Don't complain about poisoned pages */
- if (PageHWPoison(page)) {
- page_mapcount_reset(page); /* remove PageBuddy */
- return;
- }
-
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
@@ -613,14 +607,7 @@ static int __init early_debug_pagealloc(char *buf)
{
if (!buf)
return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- _debug_pagealloc_enabled = true;
-
- if (strcmp(buf, "off") == 0)
- _debug_pagealloc_enabled = false;
-
- return 0;
+ return kstrtobool(buf, &_debug_pagealloc_enabled);
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1000,7 +987,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
/*
* Check tail pages before head page information is cleared to
@@ -1042,6 +1028,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
+ kasan_free_pages(page, order);
return true;
}
@@ -1212,7 +1199,7 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -1661,6 +1648,9 @@ static void check_new_page_bad(struct page *page)
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
bad_flags = __PG_HWPOISON;
+ /* Don't complain about hwpoisoned pages */
+ page_mapcount_reset(page); /* remove PageBuddy */
+ return;
}
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
@@ -2750,10 +2740,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* one free page of a suitable size. Checking now avoids taking the zone lock
* to check in the allocation paths if no pages are free.
*/
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx,
- unsigned int alloc_flags,
- long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ int classzone_idx, unsigned int alloc_flags,
+ long free_pages)
{
long min = mark;
int o;
@@ -3180,34 +3169,33 @@ out:
return page;
}
+
+/*
+ * Maximum number of compaction retries wit a progress before OOM
+ * killer is consider as the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ enum migrate_mode mode, enum compact_result *compact_result)
{
- unsigned long compact_result;
struct page *page;
+ int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
- compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, contended_compaction);
+ *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ mode, &contended_compaction);
current->flags &= ~PF_MEMALLOC;
- switch (compact_result) {
- case COMPACT_DEFERRED:
- *deferred_compaction = true;
- /* fall-through */
- case COMPACT_SKIPPED:
+ if (*compact_result <= COMPACT_INACTIVE)
return NULL;
- default:
- break;
- }
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3233,19 +3221,112 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
+ /*
+ * In all zones where compaction was attempted (and not
+ * deferred or skipped), lock contention has been detected.
+ * For THP allocation we do not want to disrupt the others
+ * so we fallback to base pages instead.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ *compact_result = COMPACT_CONTENDED;
+
+ /*
+ * If compaction was aborted due to need_resched(), we do not
+ * want to further increase allocation latency, unless it is
+ * khugepaged trying to collapse.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_SCHED
+ && !(current->flags & PF_KTHREAD))
+ *compact_result = COMPACT_CONTENDED;
+
cond_resched();
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+ enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ int max_retries = MAX_COMPACT_RETRIES;
+
+ if (!order)
+ return false;
+
+ /*
+ * compaction considers all the zone as desperately out of memory
+ * so it doesn't really make much sense to retry except when the
+ * failure could be caused by weak migration mode.
+ */
+ if (compaction_failed(compact_result)) {
+ if (*migrate_mode == MIGRATE_ASYNC) {
+ *migrate_mode = MIGRATE_SYNC_LIGHT;
+ return true;
+ }
+ return false;
+ }
+
+ /*
+ * make sure the compaction wasn't deferred or didn't bail out early
+ * due to locks contention before we declare that we should give up.
+ * But do not retry if the given zonelist is not suitable for
+ * compaction.
+ */
+ if (compaction_withdrawn(compact_result))
+ return compaction_zonelist_suitable(ac, order, alloc_flags);
+
+ /*
+ * !costly requests are much more important than __GFP_REPEAT
+ * costly ones because they are de facto nofail and invoke OOM
+ * killer to move on while costly can fail and users are ready
+ * to cope with that. 1/4 retries is rather arbitrary but we
+ * would need much more detailed feedback from compaction to
+ * make a better decision.
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ max_retries /= 4;
+ if (compaction_retries <= max_retries)
+ return true;
+
+ return false;
+}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended_compaction,
- bool *deferred_compaction)
+ enum migrate_mode mode, enum compact_result *compact_result)
{
+ *compact_result = COMPACT_SKIPPED;
return NULL;
}
+
+static inline bool
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+ enum compact_result compact_result,
+ enum migrate_mode *migrate_mode,
+ int compaction_retries)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * There are setups with compaction disabled which would prefer to loop
+ * inside the allocator rather than hit the oom killer prematurely.
+ * Let's give them a good hope and keep retrying while the order-0
+ * watermarks are OK.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
+ ac_classzone_idx(ac), alloc_flags))
+ return true;
+ }
+ return false;
+}
#endif /* CONFIG_COMPACTION */
/* Perform direct synchronous page reclaim */
@@ -3377,6 +3458,101 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
}
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds without
+ * any progress in a row) is considered as well as the reclaimable pages on the
+ * applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+ struct alloc_context *ac, int alloc_flags,
+ bool did_some_progress, int no_progress_loops)
+{
+ struct zone *zone;
+ struct zoneref *z;
+
+ /*
+ * Make sure we converge to OOM if we cannot make any progress
+ * several times in the row.
+ */
+ if (no_progress_loops > MAX_RECLAIM_RETRIES)
+ return false;
+
+ /*
+ * Keep reclaiming pages while there is a chance this will lead somewhere.
+ * If none of the target zones can satisfy our allocation request even
+ * if all reclaimable pages are considered then we are screwed and have
+ * to go OOM.
+ */
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ unsigned long available;
+ unsigned long reclaimable;
+
+ available = reclaimable = zone_reclaimable_pages(zone);
+ available -= DIV_ROUND_UP(no_progress_loops * available,
+ MAX_RECLAIM_RETRIES);
+ available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ /*
+ * Would the allocation succeed if we reclaimed the whole
+ * available?
+ */
+ if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+ ac_classzone_idx(ac), alloc_flags, available)) {
+ /*
+ * If we didn't make any progress and have a lot of
+ * dirty + writeback pages then we should wait for
+ * an IO to complete to slow down the reclaim and
+ * prevent from pre mature OOM
+ */
+ if (!did_some_progress) {
+ unsigned long writeback;
+ unsigned long dirty;
+
+ writeback = zone_page_state_snapshot(zone,
+ NR_WRITEBACK);
+ dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+ if (2*(writeback + dirty) > reclaimable) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
+ return true;
+ }
+ }
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that
+ * a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to
+ * do a short sleep here rather than calling
+ * cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3384,11 +3560,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
unsigned int alloc_flags;
- unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
- bool deferred_compaction = false;
- int contended_compaction = COMPACT_CONTENDED_NONE;
+ enum compact_result compact_result;
+ int compaction_retries = 0;
+ int no_progress_loops = 0;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3475,8 +3651,7 @@ retry:
*/
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
@@ -3489,35 +3664,19 @@ retry:
* to heavily disrupt the system, so we fail the allocation
* instead of entering direct reclaim.
*/
- if (deferred_compaction)
- goto nopage;
-
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
+ * Compaction is contended so rather back off than cause
+ * excessive stalls.
*/
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
+ if(compact_result == COMPACT_CONTENDED)
goto nopage;
}
- /*
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse.
- */
- if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_SYNC_LIGHT;
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3529,14 +3688,38 @@ retry:
if (gfp_mask & __GFP_NORETRY)
goto noretry;
- /* Keep reclaiming pages as long as there is reasonable progress */
- pages_reclaimed += did_some_progress;
- if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
- ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
+ /*
+ * Do not retry costly high order allocations unless they are
+ * __GFP_REPEAT
+ */
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+ goto noretry;
+
+ /*
+ * Costly allocations might have made a progress but this doesn't mean
+ * their order will become available due to high fragmentation so
+ * always increment the no progress counter for them
+ */
+ if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
+ no_progress_loops = 0;
+ else
+ no_progress_loops++;
+
+ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+ did_some_progress > 0, no_progress_loops))
+ goto retry;
+
+ /*
+ * It doesn't make any sense to retry for the compaction if the order-0
+ * reclaim is not able to make any progress because the current
+ * implementation of the compaction depends on the sufficient amount
+ * of free memory (see __compaction_suitable)
+ */
+ if (did_some_progress > 0 &&
+ should_compact_retry(ac, order, alloc_flags,
+ compact_result, &migration_mode,
+ compaction_retries))
goto retry;
- }
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -3544,19 +3727,28 @@ retry:
goto got_pg;
/* Retry as long as the OOM killer is making progress */
- if (did_some_progress)
+ if (did_some_progress) {
+ no_progress_loops = 0;
goto retry;
+ }
noretry:
/*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
+ * High-order allocations do not necessarily loop after direct reclaim
+ * and reclaim/compaction depends on compaction being called after
+ * reclaim so call directly if necessary.
+ * It can become very expensive to allocate transparent hugepages at
+ * fault, so use asynchronous memory compaction for THP unless it is
+ * khugepaged trying to collapse. All other requests should tolerate
+ * at least light sync migration.
*/
+ if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
+ migration_mode = MIGRATE_ASYNC;
+ else
+ migration_mode = MIGRATE_SYNC_LIGHT;
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
+ &compact_result);
if (page)
goto got_pg;
nopage:
@@ -6671,49 +6863,6 @@ void setup_per_zone_wmarks(void)
}
/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total target max
- * memory ratio inactive anon
- * -------------------------------------
- * 10MB 1 5MB
- * 100MB 1 50MB
- * 1GB 3 250MB
- * 10GB 10 0.9GB
- * 100GB 31 3GB
- * 1TB 101 10GB
- * 10TB 320 32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
- unsigned int gb, ratio;
-
- /* Zone size in gigabytes */
- gb = zone->managed_pages >> (30 - PAGE_SHIFT);
- if (gb)
- ratio = int_sqrt(10 * gb);
- else
- ratio = 1;
-
- zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
- struct zone *zone;
-
- for_each_zone(zone)
- calculate_zone_inactive_ratio(zone);
-}
-
-/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
@@ -6758,7 +6907,6 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
- setup_per_zone_inactive_ratio();
return 0;
}
core_initcall(init_per_zone_wmark_min)