Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c        |    1
-rw-r--r--  mm/compaction.c         |   17
-rw-r--r--  mm/fadvise.c            |   15
-rw-r--r--  mm/filemap.c            |  245
-rw-r--r--  mm/gup.c                |   20
-rw-r--r--  mm/huge_memory.c        |  196
-rw-r--r--  mm/hugetlb.c            |   37
-rw-r--r--  mm/init-mm.c            |    2
-rw-r--r--  mm/internal.h           |    4
-rw-r--r--  mm/kasan/kasan.c        |    9
-rw-r--r--  mm/kasan/report.c       |    3
-rw-r--r--  mm/khugepaged.c         |   57
-rw-r--r--  mm/memcontrol.c         |   24
-rw-r--r--  mm/memory-failure.c     |    4
-rw-r--r--  mm/memory.c             |  922
-rw-r--r--  mm/memory_hotplug.c     |   56
-rw-r--r--  mm/mempolicy.c          |    4
-rw-r--r--  mm/migrate.c            |   14
-rw-r--r--  mm/mincore.c            |    2
-rw-r--r--  mm/mmap.c               |    2
-rw-r--r--  mm/mprotect.c           |    2
-rw-r--r--  mm/nommu.c              |   15
-rw-r--r--  mm/page-writeback.c     |   29
-rw-r--r--  mm/page_alloc.c         |  126
-rw-r--r--  mm/page_io.c            |    5
-rw-r--r--  mm/percpu.c             |    3
-rw-r--r--  mm/process_vm_access.c  |   12
-rw-r--r--  mm/shmem.c              |   47
-rw-r--r--  mm/slab.c               |   15
-rw-r--r--  mm/slub.c               |   27
-rw-r--r--  mm/swap.c               |    2
-rw-r--r--  mm/swapfile.c           |   20
-rw-r--r--  mm/truncate.c           |   75
-rw-r--r--  mm/util.c               |    2
-rw-r--r--  mm/vmalloc.c            |    2
-rw-r--r--  mm/vmscan.c             |   27
-rw-r--r--  mm/workingset.c         |    3
-rw-r--r--  mm/zswap.c              |   30
38 files changed, 1251 insertions(+), 825 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443f36d7..3bfed5ab2475 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
spin_lock_init(&wb->work_lock);
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+ wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
if (!wb->congested)
diff --git a/mm/compaction.c b/mm/compaction.c
index 223464227299..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page_count(page) > page_mapcount(page))
goto isolate_fail;
+ /*
+ * Only allow to migrate anonymous pages in GFP_NOFS context
+ * because those do not depend on fs locks.
+ */
+ if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ goto isolate_fail;
+
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
- int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /* Check if the GFP flags allow compaction */
- if (!may_enter_fs || !may_perform_io)
+ /*
+ * Check if the GFP flags allow compaction - GFP_NOIO is really
+ * tricky context because the migration might require IO
+ */
+ if (!may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
};
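The gfp_mask now carried in struct compact_control drives the two checks added earlier in this file: GFP_NOIO callers skip compaction entirely, and GFP_NOFS callers may only migrate pages that have no mapping. A minimal kernel-style sketch of that policy (the helper names are hypothetical, only the flag logic follows the patch):

#include <linux/gfp.h>
#include <linux/mm.h>

static bool compaction_allowed(gfp_t gfp_mask)
{
	/* GFP_NOIO is too restrictive: migration itself may need IO */
	return gfp_mask & __GFP_IO;
}

static bool may_isolate_for_migration(gfp_t gfp_mask, struct page *page)
{
	/* under GFP_NOFS only anonymous pages are safe - they take no fs locks */
	return (gfp_mask & __GFP_FS) || !page_mapping(page);
}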
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 6c707bfe02fd..a43013112581 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
}
if (end_index >= start_index) {
- unsigned long count = invalidate_mapping_pages(mapping,
+ unsigned long count;
+
+ /*
+ * It's common to FADV_DONTNEED right after
+ * the read or write that instantiates the
+ * pages, in which case there will be some
+ * sitting on the local LRU cache. Try to
+ * avoid the expensive remote drain and the
+ * second cache tree walk below by flushing
+ * them out right away.
+ */
+ lru_add_drain();
+
+ count = invalidate_mapping_pages(mapping,
start_index, end_index);
/*
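The situation the new comment describes is easiest to see from user space: a file is streamed once and then dropped from the page cache with POSIX_FADV_DONTNEED. A hedged sketch (path and buffer size are arbitrary, error handling omitted):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/tmp/bigfile", O_RDONLY);	/* hypothetical file */
	ssize_t n;

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;	/* pages are instantiated and land on the local LRU cache */

	/* With the lru_add_drain() above, this invalidate now catches the
	 * freshly read pages instead of leaving them on the local pagevec. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
	close(fd);
	return 0;
}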
diff --git a/mm/filemap.c b/mm/filemap.c
index 5b4dd03130da..3f9afded581b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,11 +135,10 @@ static int page_cache_tree_insert(struct address_space *mapping,
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
- (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
- RADIX_DAX_ENTRY_LOCK));
+ dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
/* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index,
- false);
+ dax_wake_mapping_entry_waiter(mapping, page->index, p,
+ true);
}
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
@@ -740,45 +739,159 @@ EXPORT_SYMBOL(__page_cache_alloc);
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-wait_queue_head_t *page_waitqueue(struct page *page)
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
{
- return bit_waitqueue(page, 0);
+ return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
}
-EXPORT_SYMBOL(page_waitqueue);
-void wait_on_page_bit(struct page *page, int bit_nr)
+void __init pagecache_init(void)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ int i;
+
+ for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+ init_waitqueue_head(&page_wait_table[i]);
- if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ page_writeback_init();
}
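Waiters for different pages can now land on the same one of the 256 hashed queues; that is why wake_page_function() below filters on the page pointer carried in the wake key. A small sketch of the collision case (illustrative only; the bucket count is taken from the patch, the helper names are not):

#include <linux/hash.h>
#include <linux/mm_types.h>

#define DEMO_WAIT_BITS	8	/* matches PAGE_WAIT_TABLE_BITS above */

static inline unsigned int demo_bucket(const struct page *page)
{
	return hash_ptr(page, DEMO_WAIT_BITS);
}

/*
 * Two distinct pages collide when their hashes match; a wake-up for one
 * of them must not complete waiters of the other, hence the page
 * comparison in the wake function rather than a plain wake_up().
 */
static inline bool demo_pages_collide(const struct page *a, const struct page *b)
{
	return a != b && demo_bucket(a) == demo_bucket(b);
}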
-EXPORT_SYMBOL(wait_on_page_bit);
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+struct wait_page_key {
+ struct page *page;
+ int bit_nr;
+ int page_match;
+};
+
+struct wait_page_queue {
+ struct page *page;
+ int bit_nr;
+ wait_queue_t wait;
+};
+
+static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ struct wait_page_key *key = arg;
+ struct wait_page_queue *wait_page
+ = container_of(wait, struct wait_page_queue, wait);
- if (!test_bit(bit_nr, &page->flags))
+ if (wait_page->page != key->page)
+ return 0;
+ key->page_match = 1;
+
+ if (wait_page->bit_nr != key->bit_nr)
+ return 0;
+ if (test_bit(key->bit_nr, &key->page->flags))
return 0;
- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io, TASK_KILLABLE);
+ return autoremove_wake_function(wait, mode, sync, key);
}
-int wait_on_page_bit_killable_timeout(struct page *page,
- int bit_nr, unsigned long timeout)
+void wake_up_page_bit(struct page *page, int bit_nr)
{
- DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+ wait_queue_head_t *q = page_waitqueue(page);
+ struct wait_page_key key;
+ unsigned long flags;
- wait.key.timeout = jiffies + timeout;
- if (!test_bit(bit_nr, &page->flags))
- return 0;
- return __wait_on_bit(page_waitqueue(page), &wait,
- bit_wait_io_timeout, TASK_KILLABLE);
+ key.page = page;
+ key.bit_nr = bit_nr;
+ key.page_match = 0;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_locked_key(q, TASK_NORMAL, &key);
+ /*
+ * It is possible for other pages to have collided on the waitqueue
+ * hash, so in that case check for a page match. That prevents a long-
+ * term waiter
+ *
+ * It is still possible to miss a case here, when we woke page waiters
+ * and removed them from the waitqueue, but there are still other
+ * page waiters.
+ */
+ if (!waitqueue_active(q) || !key.page_match) {
+ ClearPageWaiters(page);
+ /*
+ * It's possible to miss clearing Waiters here, when we woke
+ * our page waiters, but the hashed waitqueue has waiters for
+ * other pages on it.
+ *
+ * That's okay, it's a rare case. The next waker will clear it.
+ */
+ }
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(wake_up_page_bit);
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ struct page *page, int bit_nr, int state, bool lock)
+{
+ struct wait_page_queue wait_page;
+ wait_queue_t *wait = &wait_page.wait;
+ int ret = 0;
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.page = page;
+ wait_page.bit_nr = bit_nr;
+
+ for (;;) {
+ spin_lock_irq(&q->lock);
+
+ if (likely(list_empty(&wait->task_list))) {
+ if (lock)
+ __add_wait_queue_tail_exclusive(q, wait);
+ else
+ __add_wait_queue(q, wait);
+ SetPageWaiters(page);
+ }
+
+ set_current_state(state);
+
+ spin_unlock_irq(&q->lock);
+
+ if (likely(test_bit(bit_nr, &page->flags))) {
+ io_schedule();
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ if (lock) {
+ if (!test_and_set_bit_lock(bit_nr, &page->flags))
+ break;
+ } else {
+ if (!test_bit(bit_nr, &page->flags))
+ break;
+ }
+ }
+
+ finish_wait(q, wait);
+
+ /*
+ * A signal could leave PageWaiters set. Clearing it here if
+ * !waitqueue_active would be possible (by open-coding finish_wait),
+ * but still fail to catch it in the case of wait hash collision. We
+ * already can fail to clear wait hash collision cases, so don't
+ * bother with signals either.
+ */
+
+ return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
-EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -794,10 +907,34 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, waiter);
+ SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(add_page_wait_queue);
+#ifndef clear_bit_unlock_is_negative_byte
+
+/*
+ * PG_waiters is the high bit in the same byte as PG_lock.
+ *
+ * On x86 (and on many other architectures), we can clear PG_lock and
+ * test the sign bit at the same time. But if the architecture does
+ * not support that special operation, we just do this all by hand
+ * instead.
+ *
+ * The read of PG_waiters has to be after (or concurrently with) PG_locked
+ * being cleared, but a memory barrier should be unnecessary since it is
+ * in the same byte as PG_locked.
+ */
+static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
+{
+ clear_bit_unlock(nr, mem);
+ /* smp_mb__after_atomic(); */
+ return test_bit(PG_waiters, mem);
+}
+
+#endif
+
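A plain user-space demonstration of the bit layout that both the generic fallback above and the reworked unlock_page() below depend on: PG_locked sits in bit 0 and PG_waiters in bit 7 of the same flags byte, so once the lock bit is cleared the sign of that byte says whether any waiter needs waking. This sketch only shows the arithmetic, not the kernel's atomic version:

#include <stdio.h>

int main(void)
{
	unsigned char flags = (1u << 0) | (1u << 7);	/* PG_locked + PG_waiters */

	flags &= (unsigned char)~(1u << 0);		/* drop the lock bit */
	if ((signed char)flags < 0)			/* sign bit == PG_waiters */
		printf("waiters present, wake_up_page_bit() needed\n");
	return 0;
}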
/**
* unlock_page - unlock a locked page
* @page: the page
@@ -807,16 +944,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
*/
void unlock_page(struct page *page)
{
+ BUILD_BUG_ON(PG_waiters != 7);
page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
- wake_up_page(page, PG_locked);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+ wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
@@ -875,23 +1015,19 @@ EXPORT_SYMBOL_GPL(page_endio);
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
*/
-void __lock_page(struct page *page)
+void __lock_page(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
- TASK_UNINTERRUPTIBLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
}
EXPORT_SYMBOL(__lock_page);
-int __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *__page)
{
- struct page *page_head = compound_head(page);
- DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
-
- return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
- bit_wait_io, TASK_KILLABLE);
+ struct page *page = compound_head(__page);
+ wait_queue_head_t *q = page_waitqueue(page);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1638,7 +1774,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
int error = 0;
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
- return -EINVAL;
+ return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
index = *ppos >> PAGE_SHIFT;
@@ -1655,6 +1791,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
cond_resched();
find_page:
+ if (fatal_signal_pending(current)) {
+ error = -EINTR;
+ goto out;
+ }
+
page = find_get_page(mapping, index);
if (!page) {
page_cache_sync_readahead(mapping,
@@ -2165,12 +2306,12 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
@@ -2226,11 +2367,11 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2240,7 +2381,7 @@ skip:
put_page(page);
next:
/* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break;
if (iter.index == end_pgoff)
break;
diff --git a/mm/gup.c b/mm/gup.c
index e50178c58b97..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
* caller if required (just like with __get_user_pages). "FOLL_GET"
* is set implicitly if "pages" is non-NULL.
*/
-__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
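A hedged sketch of the calling convention implied by the new argument (the helper name is hypothetical): mmap_sem must be held on entry, and when the fault path used VM_FAULT_RETRY the semaphore may already have been released, which the caller detects through *locked:

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_one_remote_page(struct task_struct *tsk, struct mm_struct *mm,
				unsigned long addr, struct page **page)
{
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE, page,
				    NULL, &locked);
	if (locked)		/* still held unless VM_FAULT_RETRY dropped it */
		up_read(&mm->mmap_sem);
	return ret;
}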
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's. We also
- * obviously don't pass FOLL_REMOTE in here.
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cee42cf05477..5f3ad65c85de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
gfp_t gfp)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
*/
__SetPageUptodate(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
@@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct fault_env *fe)
+int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
@@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
set = false;
- if (pmd_none(*fe->pmd)) {
+ if (pmd_none(*vmf->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(fe->ptl);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ spin_unlock(vmf->ptl);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, fe->pmd, zero_page);
- spin_unlock(fe->ptl);
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
set = true;
}
} else
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
if (!set)
pte_free(vma->vm_mm, pgtable);
return ret;
@@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(fe, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -783,6 +783,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
+ /*
+ * When we COW a devmap PMD entry, we split it into PTEs, so we should
+ * not be in this function with `flags & FOLL_COW` set.
+ */
+ WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -879,30 +885,32 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
- fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = fe->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
- fe->flags & FAULT_FLAG_WRITE))
- update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
+ if (write)
+ entry = pmd_mkdirty(entry);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
@@ -919,9 +927,8 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
- __GFP_OTHER_NODE, vma,
- fe->address, page_to_nid(page));
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
@@ -952,15 +959,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -969,20 +976,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- fe->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*fe->pte));
- set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
- pte_unmap(fe->pte);
+ vmf->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, fe->pmd, pgtable);
+ pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -993,7 +1000,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
@@ -1005,23 +1012,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
- fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1034,13 +1041,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1053,12 +1060,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
+ ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1070,7 +1077,7 @@ alloc:
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
put_page(page);
ret |= VM_FAULT_FALLBACK;
@@ -1090,11 +1097,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(fe->ptl);
+ spin_lock(vmf->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
- spin_unlock(fe->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1102,12 +1109,12 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
} else {
@@ -1117,16 +1124,26 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
@@ -1137,7 +1154,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
goto out;
/* Avoid dumping huge zero page */
@@ -1196,12 +1213,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
+int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1209,8 +1226,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
bool was_writable;
int flags = 0;
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(pmd, *fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
@@ -1218,9 +1235,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*fe->pmd))) {
- page = pmd_page(*fe->pmd);
- spin_unlock(fe->ptl);
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1253,7 +1270,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1264,12 +1281,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* to serialises splits
*/
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(pmd, *fe->pmd))) {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1287,9 +1304,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- fe->pmd, pmd, fe->address, page, target_nid);
+ vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1304,18 +1321,19 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ vmf->flags);
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3edb759c5c7d..c7025c132670 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1773,23 +1773,32 @@ free:
}
/*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ * in unused_resv_pages. This corresponds to the prior adjustments made
+ * to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ * the reservation. As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held. However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock. Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing. Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- /* Uncommit the reservation */
- h->resv_huge_pages -= unused_resv_pages;
-
/* Cannot return gigantic pages currently */
if (hstate_is_gigantic(h))
- return;
+ goto out;
+ /*
+ * Part (or even all) of the reservation could have been backed
+ * by pre-allocated pages. Only free surplus pages.
+ */
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
/*
@@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages(struct hstate *h,
* when the nodes with surplus pages have no free pages.
* free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
+ *
+ * Note that we decrement resv_huge_pages as we free the pages. If
+ * we drop the lock, resv_huge_pages will still be sufficiently large
+ * to cover subsequent pages we may free.
*/
while (nr_pages--) {
+ h->resv_huge_pages--;
+ unused_resv_pages--;
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
- break;
+ goto out;
cond_resched_lock(&hugetlb_lock);
}
+
+out:
+ /* Fully uncommit the reservation */
+ h->resv_huge_pages -= unused_resv_pages;
}
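A short worked example of the invariant described above (the numbers are hypothetical): start with resv_huge_pages = 10, unused_resv_pages = 4 and four surplus pages to return. Each loop iteration first drops resv_huge_pages and unused_resv_pages by one and only then frees a page, so at every point where cond_resched_lock() may drop hugetlb_lock the counts read 9/3, 8/2, 7/1 and finally 6/0 (reservation remaining / surplus pages still to free): the reservation always covers the pages not yet freed, so a concurrent allocation cannot claim them. The final subtraction at the out: label then removes only whatever part of the reservation was not already returned page by page (zero in this example, non-zero when surplus_huge_pages is smaller than unused_resv_pages).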
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a56a851908d2..975e49f00f34 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,6 +6,7 @@
#include <linux/cpumask.h>
#include <linux/atomic.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -21,5 +22,6 @@ struct mm_struct init_mm = {
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+ .user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..7aa2ea0a8623 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,9 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+void page_writeback_init(void);
+
+int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0e9505f66ec1..b2a0cff2bb35 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
/* Unpoison the stack for the current task beyond a watermark sp value. */
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
{
- __kasan_unpoison_stack(current, watermark);
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
}
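The base address computation above is ordinary power-of-two alignment; a standalone sketch with a made-up THREAD_SIZE and watermark value:

#include <stdio.h>

#define THREAD_SIZE (16UL * 1024)	/* hypothetical; must be a power of two */

int main(void)
{
	unsigned long watermark = 0xffffc90000123a40UL;	     /* example sp */
	unsigned long base = watermark & ~(THREAD_SIZE - 1); /* stack base */

	/* everything from base up to the watermark gets unpoisoned */
	printf("base=%#lx, length=%lu bytes\n", base, watermark - base);
	return 0;
}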
/*
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b82b3e215157..f479365530b6 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,7 @@
*
*/
+#include <linux/ftrace.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -300,6 +301,8 @@ void kasan_report(unsigned long addr, size_t size,
if (likely(!kasan_report_enabled()))
return;
+ disable_trace_on_warning();
+
info.access_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..77ae3239c3de 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- pte_t pteval;
int swapped_in = 0, ret = 0;
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
};
/* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- fe.pte = pte_offset_map(pmd, address);
- for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- fe.pte++, fe.address += PAGE_SIZE) {
- pteval = *fe.pte;
- if (!is_swap_pte(pteval))
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
continue;
swapped_in++;
- ret = do_swap_page(&fe, pteval);
+ ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- fe.pte = pte_offset_map(pmd, fe.address);
+ vmf.pte = pte_offset_map(pmd, vmf.address);
}
- fe.pte--;
- pte_unmap(fe.pte);
+ vmf.pte--;
+ pte_unmap(vmf.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
* Before allocating the hugepage, release the mmap_sem read lock.
@@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
- bool deposited = false;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
- /*
- * now deposit the pgtable for arch that need it
- * otherwise free it.
- */
- if (arch_needs_pgtable_deposit()) {
- /*
- * The deposit should be visibile only after
- * collapse is seen by others.
- */
- smp_wmb();
- pgtable_trans_huge_deposit(vma->vm_mm, pmd,
- pmd_pgtable(_pmd));
- deposited = true;
- }
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
- if (!deposited) {
- atomic_long_dec(&vma->vm_mm->nr_ptes);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
- }
+ atomic_long_dec(&vma->vm_mm->nr_ptes);
+ pte_free(vma->vm_mm, pmd_pgtable(_pmd));
}
}
i_mmap_unlock_write(mapping);
@@ -1326,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm,
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() |
- __GFP_OTHER_NODE | __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
@@ -1446,7 +1428,7 @@ static void collapse_shmem(struct mm_struct *mm,
radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
@@ -1546,7 +1528,6 @@ tree_unlocked:
/* Put holes back where they were */
radix_tree_delete(&mapping->page_tree,
iter.index);
- slot = radix_tree_iter_next(&iter);
continue;
}
@@ -1557,11 +1538,11 @@ tree_unlocked:
page_ref_unfreeze(page, 2);
radix_tree_replace_slot(&mapping->page_tree,
slot, page);
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
spin_lock_irq(&mapping->tree_lock);
- slot = radix_tree_iter_next(&iter);
}
VM_BUG_ON(nr_none);
spin_unlock_irq(&mapping->tree_lock);
@@ -1641,8 +1622,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 175ec51c346d..b822e158b319 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,7 +68,7 @@
#include <net/ip.h>
#include "slab.h"
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <trace/events/vmscan.h>
@@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
unsigned long nr = 0;
- struct mem_cgroup_per_node *mz;
enum lru_list lru;
VM_BUG_ON((unsigned)nid >= nr_node_ids);
@@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- mz = mem_cgroup_nodeinfo(memcg, nid);
- nr += mz->lru_size[lru];
+ nr += mem_cgroup_get_lru_size(lruvec, lru);
}
return nr;
}
@@ -1002,6 +1001,7 @@ out:
* mem_cgroup_update_lru_size - account for adding or removing an lru page
* @lruvec: mem_cgroup per zone lru vector
* @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
@@ -1009,27 +1009,25 @@ out:
* so as to allow it to check that lru_size 0 is consistent with list_empty).
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
- int nr_pages)
+ int zid, int nr_pages)
{
struct mem_cgroup_per_node *mz;
unsigned long *lru_size;
long size;
- bool empty;
if (mem_cgroup_disabled())
return;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- lru_size = mz->lru_size + lru;
- empty = list_empty(lruvec->lists + lru);
+ lru_size = &mz->lru_zone_size[zid][lru];
if (nr_pages < 0)
*lru_size += nr_pages;
size = *lru_size;
- if (WARN_ONCE(size < 0 || empty != !size,
- "%s(%p, %d, %d): lru_size %ld but %sempty\n",
- __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+ if (WARN_ONCE(size < 0,
+ "%s(%p, %d, %d): lru_size %ld\n",
+ __func__, lruvec, lru, nr_pages, size)) {
VM_BUG_ON(1);
*lru_size = 0;
}
@@ -4355,9 +4353,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
return ret;
}
- /* Try charges one by one with reclaim */
+ /* Try charges one by one with reclaim, but do not retry */
while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
if (ret)
return ret;
mc.precharge++;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 19e796d36a62..f283c7e0a2a3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -764,12 +764,11 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/
#define dirty (1UL << PG_dirty)
-#define sc (1UL << PG_swapcache)
+#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
-#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
@@ -819,7 +818,6 @@ static struct page_state {
#undef mlock
#undef writeback
#undef lru
-#undef swapbacked
#undef head
#undef slab
#undef reserved
diff --git a/mm/memory.c b/mm/memory.c
index 32e9b7aec366..6bf2b471e30c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,7 +68,7 @@
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2263,79 +2269,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns
+ * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
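do_wp_page() above also relies on the trylock-or-back-off idiom: trylock_page() while the PTE lock is held, and if that fails, drop the PTE lock, take the page lock the sleeping way, re-take the PTE lock and revalidate before continuing. A hedged userspace analogue with two pthread mutexes (all names invented, not kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pte_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static int pte_state;                       /* value guarded by pte_lock */

/* Called with pte_lock held; returns with both locks held on success. */
static bool lock_page_or_revalidate(int snapshot)
{
    if (pthread_mutex_trylock(&page_lock) == 0)
        return true;                        /* fast path, no race window */

    pthread_mutex_unlock(&pte_lock);        /* back off before sleeping */
    pthread_mutex_lock(&page_lock);         /* may block */
    pthread_mutex_lock(&pte_lock);

    if (pte_state != snapshot) {            /* analogous to !pte_same() */
        pthread_mutex_unlock(&page_lock);
        return false;                       /* caller must bail out */
    }
    return true;
}

int main(void)
{
    int snap;

    pthread_mutex_lock(&pte_lock);
    snap = pte_state;
    if (lock_page_or_revalidate(snap)) {
        /* both locks held: safe to operate on the "page" here */
        pthread_mutex_unlock(&page_lock);
    }
    pthread_mutex_unlock(&pte_lock);
    printf("done\n");
    return 0;
}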
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
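The zero-page fast path in do_anonymous_page() is visible from userspace: a read fault on untouched anonymous memory is served from the shared zero page, and a private page is only allocated on the first write. A small standalone demonstration of that behaviour (it only shows the read-then-write semantics, not RSS accounting):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096;
    unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    /* Read fault: serviced from the zero page, no private page yet. */
    printf("first byte before write: %u\n", p[0]);

    /* Write fault: only now is a private zeroed page allocated. */
    p[0] = 7;
    printf("first byte after write:  %u\n", p[0]);

    munmap(p, len);
    return 0;
}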
@@ -2843,62 +2863,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ ret = vma->vm_ops->fault(vma, vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
- return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static void deposit_prealloc_pte(struct fault_env *fe)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
- fe->prealloc_pte = 0;
+ vmf->prealloc_pte = 0;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
- if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
- fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,28 +2998,21 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
- deposit_prealloc_pte(fe);
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
- /*
- * If we are going to fallback to pte mapping, do a
- * withdraw with pmd lock held.
- */
- if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- fe->pmd);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3022,44 +3023,43 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the function allocates a page table or uses a pre-allocated one.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
- goto fault_handled;
+ return ret;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
- goto fault_handled;
+ return ret;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte))) {
- ret = VM_FAULT_NOPAGE;
- goto fault_handled;
- }
+ if (unlikely(!pte_none(*vmf->pte)))
+ return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
@@ -3068,25 +3068,50 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
- ret = 0;
+ update_mmu_cache(vma, vmf->address, vmf->pte);
-fault_handled:
- /* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
- }
+ return 0;
+}
+
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts the PTE
+ * for the given page, adds reverse page mapping, and handles memcg charges and
+ * LRU addition. The function returns 0 on success or a VM_FAULT_ code in case
+ * of error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference to the page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
@@ -3154,17 +3179,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3172,45 +3198,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() hasn't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
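The window arithmetic in do_fault_around() - masking the address down to the fault-around window, clamping to the VMA start, and bounding end_pgoff by the page table, the VMA and nr_pages - can be checked in isolation. A standalone sketch assuming x86-64-style constants (PAGE_SHIFT 12, PTRS_PER_PTE 512) and illustrative sample values:

#include <stdio.h>

#define PAGE_SHIFT   12
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define PAGE_MASK    (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE 512UL

static unsigned long min3(unsigned long a, unsigned long b, unsigned long c)
{
    unsigned long m = a < b ? a : b;
    return m < c ? m : c;
}

int main(void)
{
    unsigned long fault_around_bytes = 65536;    /* default: 16 pages */
    unsigned long address = 0x7f0000003120UL;    /* faulting address */
    unsigned long vm_start = 0x7f0000001000UL;   /* VMA start */
    unsigned long vma_pages = 64, vm_pgoff = 0;  /* VMA size / file offset */
    unsigned long start_pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

    unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
    unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
    unsigned long around = address & mask;
    unsigned long start, end_pgoff, off;

    start = around > vm_start ? around : vm_start;    /* clamp to the VMA */
    off = ((address - start) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    start_pgoff -= off;

    /* Do not cross the page table, the VMA, or the nr_pages window. */
    end_pgoff = start_pgoff - ((start >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
                    + PTRS_PER_PTE - 1;
    end_pgoff = min3(end_pgoff, vma_pages + vm_pgoff - 1,
                     start_pgoff + nr_pages - 1);

    printf("map pgoff %lu..%lu starting at %#lx\n",
           start_pgoff, end_pgoff, start);
    return 0;
}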
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3219,80 +3245,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3301,46 +3314,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3350,19 +3341,27 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
+ int ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
- return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ ret = VM_FAULT_SIGBUS;
+ else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ ret = do_read_fault(vmf);
+ else if (!(vma->vm_flags & VM_SHARED))
+ ret = do_cow_fault(vmf);
+ else
+ ret = do_shared_fault(vmf);
+
+ /* preallocated pagetable is unused: free it */
+ if (vmf->prealloc_pte) {
+ pte_free(vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
+ }
+ return ret;
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3379,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
+ pte_t pte = vmf->orig_pte;
bool was_writable = pte_write(pte);
int flags = 0;
@@ -3400,10 +3400,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -3412,18 +3412,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3447,9 +3447,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3469,28 +3469,28 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
+ return do_huge_pmd_anonymous_page(vmf);
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+ vmf->flags);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+ vmf->pmd, vmf->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3515,21 +3515,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3537,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3550,38 +3549,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3589,11 +3589,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3606,10 +3606,12 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3619,35 +3621,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+ vmf.pmd = pmd_alloc(mm, pud, address);
+ if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
+ pmd_t orig_pmd = *vmf.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
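__handle_mm_fault() now fills vmf.address and vmf.pgoff up front instead of recomputing them in the per-type handlers. A tiny standalone mirror of that initialization (4K pages assumed, sample values invented):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Page offset of 'address' within the mapping, like linear_page_index(). */
static unsigned long linear_page_index(unsigned long vm_start,
                                       unsigned long vm_pgoff,
                                       unsigned long address)
{
    return ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
    unsigned long vm_start = 0x400000, vm_pgoff = 0;
    unsigned long address = 0x4021a8;                    /* unaligned fault */

    printf("vmf.address = %#lx\n", address & PAGE_MASK); /* 0x402000 */
    printf("vmf.pgoff   = %lu\n",
           linear_page_index(vm_start, vm_pgoff, address)); /* 2 */
    return 0;
}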
/*
@@ -3770,8 +3772,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
@@ -3788,11 +3790,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
- /* We cannot handle huge page PFN maps. Luckily they don't exist. */
- if (pmd_huge(*pmd))
+ if (pmd_huge(*pmd)) {
+ if (!pmdpp)
+ goto out;
+
+ *ptlp = pmd_lock(mm, pmd);
+ if (pmd_huge(*pmd)) {
+ *pmdpp = pmd;
+ return 0;
+ }
+ spin_unlock(*ptlp);
+ }
+
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -3815,9 +3826,23 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+ ptlp)));
+ return res;
+}
+
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+ ptlp)));
return res;
}
+EXPORT_SYMBOL(follow_pte_pmd);
/**
* follow_pfn - look up PFN at a user virtual address
@@ -3904,7 +3929,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
*/
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -3919,7 +3944,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
@@ -4002,6 +4027,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
return ret;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/*
* Print the name of a VMA.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e43142c15631..b8c11e063ff0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1033,36 +1033,39 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target)
+bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+ enum zone_type target, int *zone_shift)
{
struct zone *zone = page_zone(pfn_to_page(pfn));
enum zone_type idx = zone_idx(zone);
int i;
+ *zone_shift = 0;
+
if (idx < target) {
/* pages must be at end of current zone */
if (pfn + nr_pages != zone_end_pfn(zone))
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = idx + 1; i < target; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
if (target < idx) {
/* pages must be at beginning of current zone */
if (pfn != zone->zone_start_pfn)
- return 0;
+ return false;
/* no zones in use between current zone and target */
for (i = target + 1; i < idx; i++)
if (zone_is_initialized(zone - idx + i))
- return 0;
+ return false;
}
- return target - idx;
+ *zone_shift = target - idx;
+ return true;
}
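The zone_can_shift() change fixes an interface problem: a return value of 0 used to mean both "a valid shift of zero" and "shifting not allowed". A minimal illustration of the old versus new calling convention, with invented names:

#include <stdbool.h>
#include <stdio.h>

/* Old style: 0 is ambiguous (no shift needed? or not allowed?). */
static int old_can_shift(int idx, int target, bool allowed)
{
    return allowed ? target - idx : 0;
}

/* New style: success is explicit, the shift is reported separately. */
static bool new_can_shift(int idx, int target, bool allowed, int *shift)
{
    *shift = 0;
    if (!allowed)
        return false;
    *shift = target - idx;
    return true;
}

int main(void)
{
    int shift;

    /* target == idx and allowed: old style returns 0, same as failure. */
    printf("old: %d\n", old_can_shift(2, 2, true));
    printf("new: ok=%d shift=%d\n",
           new_can_shift(2, 2, true, &shift), shift);
    return 0;
}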
/* Must be protected by mem_hotplug_begin() */
@@ -1089,10 +1092,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
!can_online_high_movable(zone))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
- else if (online_type == MMOP_ONLINE_MOVABLE)
- zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
+ return -EINVAL;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
+ return -EINVAL;
+ }
zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
if (!zone)
@@ -1477,17 +1483,20 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
+ * When true, return the valid [start, end) sub-range in *valid_start and *valid_end.
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long *valid_start, unsigned long *valid_end)
{
unsigned long pfn, sec_end_pfn;
+ unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
- for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn);
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
- pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) {
+ pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
/* Make sure the memory section is present first */
if (!present_section_nr(pfn_to_section_nr(pfn)))
continue;
@@ -1503,10 +1512,20 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
+ if (!zone)
+ start = pfn + i;
zone = page_zone(page);
+ end = pfn + MAX_ORDER_NR_PAGES;
}
}
- return 1;
+
+ if (zone) {
+ *valid_start = start;
+ *valid_end = end;
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
@@ -1833,6 +1852,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
long offlined_pages;
int ret, drain, retry_max, node;
unsigned long flags;
+ unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
@@ -1843,10 +1863,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
return -EINVAL;
/* This makes hotplug much easier...and readable.
We assume this for now. */
- if (!test_pages_in_a_zone(start_pfn, end_pfn))
+ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
return -EINVAL;
- zone = page_zone(pfn_to_page(start_pfn));
+ zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6d3639e1f254..1e7873e40c9a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -96,7 +96,7 @@
#include <linux/printk.h>
#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
@@ -2017,8 +2017,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
zl = policy_zonelist(gfp, pol, node);
- mpol_cond_put(pol);
page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+ mpol_cond_put(pol);
out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
diff --git a/mm/migrate.c b/mm/migrate.c
index 0ed24b1fa77b..87f4d0f81819 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -466,13 +466,15 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- __SetPageSwapBacked(newpage);
-
get_page(newpage); /* add cache reference */
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ if (PageSwapBacked(page)) {
+ __SetPageSwapBacked(newpage);
+ if (PageSwapCache(page)) {
+ SetPageSwapCache(newpage);
+ set_page_private(newpage, page_private(page));
+ }
+ } else {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
}
/* Move dirty while page refs frozen and newpage not yet exposed */
diff --git a/mm/mincore.c b/mm/mincore.c
index bfb866435478..ddb872da3f5b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -16,7 +16,7 @@
#include <linux/swapops.h>
#include <linux/hugetlb.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
diff --git a/mm/mmap.c b/mm/mmap.c
index 1af87c14183d..dc4291dcc99b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,7 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cc2459c57f60..f9c07f54dd62 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -25,7 +25,7 @@
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b8faaf2a9e9..24f9f5f39145 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -35,7 +35,7 @@
#include <linux/audit.h>
#include <linux/printk.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -1801,14 +1801,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
}
EXPORT_SYMBOL(filemap_map_pages);
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
@@ -1878,6 +1878,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
mmput(mm);
return len;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903..290e8b7d3181 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1778,6 +1778,7 @@ pause:
pause,
start_time);
__set_current_state(TASK_KILLABLE);
+ wb->dirty_sleep = now;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
@@ -2105,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged;
-
- do {
- spin_lock_irq(&mapping->tree_lock);
- tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
- &start, end, WRITEBACK_TAG_BATCH,
- PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ unsigned long tagged = 0;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ PAGECACHE_TAG_DIRTY) {
+ if (iter.index > end)
+ break;
+ radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ PAGECACHE_TAG_TOWRITE);
+ tagged++;
+ if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
- WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
- /* We check 'start' to handle wrapping when end == ~0UL */
- } while (tagged >= WRITEBACK_TAG_BATCH && start);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
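The rewritten tag_pages_for_writeback() keeps the tree lock across a batch of WRITEBACK_TAG_BATCH entries, then drops it and reschedules before resuming from the iterator position. A userspace analogue of that batching, with a plain array and a pthread mutex standing in for the radix tree and tree_lock (names invented):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NITEMS 10000
#define BATCH  4096

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static int tagged_dirty[NITEMS];    /* stands in for the DIRTY tag */
static int tagged_towrite[NITEMS];  /* stands in for the TOWRITE tag */

static void tag_for_writeback(void)
{
    unsigned long tagged = 0;
    int i;

    pthread_mutex_lock(&tree_lock);
    for (i = 0; i < NITEMS; i++) {
        if (!tagged_dirty[i])
            continue;
        tagged_towrite[i] = 1;
        if (++tagged % BATCH)
            continue;
        /* Batch done: let others take the lock, then resume at i+1. */
        pthread_mutex_unlock(&tree_lock);
        sched_yield();
        pthread_mutex_lock(&tree_lock);
    }
    pthread_mutex_unlock(&tree_lock);
    printf("tagged %lu items\n", tagged);
}

int main(void)
{
    int i;

    for (i = 0; i < NITEMS; i += 3)    /* mark every third item dirty */
        tagged_dirty[i] = 1;
    tag_for_writeback();
    return 0;
}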
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f64e7bcb43b7..f3e0c69a97b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone,
#endif
for (page = start_page; page <= end_page;) {
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
if (!PageBuddy(page)) {
page++;
continue;
@@ -2583,30 +2583,22 @@ int __isolate_free_page(struct page *page, unsigned int order)
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
*/
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
- gfp_t flags)
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
- int local_nid = numa_node_id();
enum zone_stat_item local_stat = NUMA_LOCAL;
- if (unlikely(flags & __GFP_OTHER_NODE)) {
+ if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
- local_nid = preferred_zone->node;
- }
- if (z->node == local_nid) {
+ if (z->node == preferred_zone->node)
__inc_zone_state(z, NUMA_HIT);
- __inc_zone_state(z, local_stat);
- } else {
+ else {
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
+ __inc_zone_state(z, local_stat);
#endif
}
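The simplified zone_statistics() now classifies every allocation by comparing the allocating zone's node against the preferred node and the node the caller runs on. A standalone mirror of that decision, purely illustrative:

#include <stdio.h>

static void classify(int alloc_node, int preferred_node, int running_node)
{
    const char *local = (alloc_node == running_node) ? "NUMA_LOCAL"
                                                     : "NUMA_OTHER";

    if (alloc_node == preferred_node)
        printf("NUMA_HIT + %s\n", local);
    else
        printf("NUMA_MISS + NUMA_FOREIGN(preferred) + %s\n", local);
}

int main(void)
{
    classify(0, 0, 0);  /* hit on the local node */
    classify(1, 0, 0);  /* fell back to a remote node */
    classify(1, 1, 0);  /* allocating on behalf of another node: hit, but "other" */
    return 0;
}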
@@ -2674,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
}
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, gfp_flags);
+ zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
@@ -3531,12 +3523,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+ enum compact_priority compact_priority;
enum compact_result compact_result;
- int compaction_retries = 0;
- int no_progress_loops = 0;
+ int compaction_retries;
+ int no_progress_loops;
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
+ unsigned int cpuset_mems_cookie;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3557,6 +3550,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
+retry_cpuset:
+ compaction_retries = 0;
+ no_progress_loops = 0;
+ compact_priority = DEF_COMPACT_PRIORITY;
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ /*
+ * We need to recalculate the starting point for the zonelist iterator
+ * because we might have used different nodemask in the fast path, or
+ * there was a cpuset modification and we are retrying - otherwise we
+ * could end up iterating over non-eligible zones endlessly.
+ */
+ ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, ac->nodemask);
+ if (!ac->preferred_zoneref->zone)
+ goto nopage;
+
+
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3716,6 +3726,13 @@ retry:
&compaction_retries))
goto retry;
+ /*
+ * It's possible we raced with cpuset update so the OOM would be
+ * premature (see below the nopage: label for full explanation).
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
@@ -3728,6 +3745,16 @@ retry:
}
nopage:
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ goto retry_cpuset;
+
warn_alloc(gfp_mask,
"page allocation failure: order:%u", order);
got_pg:
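The retry_cpuset logic added above is a cookie-based retry: sample the mems_allowed generation before the slow path, and if it changed by the time the allocation is about to fail, restart rather than report a possibly premature failure. A hedged userspace sketch of the same pattern (the atomic counter and names are stand-ins, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint mems_generation;    /* bumped whenever the mask changes */

static unsigned int read_begin(void)
{
    return atomic_load(&mems_generation);
}

static bool read_retry(unsigned int cookie)
{
    return atomic_load(&mems_generation) != cookie;
}

static bool try_allocate(void)
{
    return false;    /* pretend the attempt failed */
}

int main(void)
{
    unsigned int cookie;
    bool ok;

retry:
    cookie = read_begin();
    ok = try_allocate();
    if (!ok && read_retry(cookie)) {
        /* The constraints changed under us: the failure may be stale. */
        goto retry;
    }
    printf("allocation %s\n", ok ? "succeeded" : "failed");
    return 0;
}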
@@ -3742,7 +3769,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
struct page *page;
- unsigned int cpuset_mems_cookie;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
@@ -3779,9 +3805,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
-
/* Dirty zone balancing only done in the fast path */
ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -3792,8 +3815,13 @@ retry_cpuset:
*/
ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
ac.high_zoneidx, ac.nodemask);
- if (!ac.preferred_zoneref) {
+ if (!ac.preferred_zoneref->zone) {
page = NULL;
+ /*
+ * This might be due to race with cpuset_current_mems_allowed
+ * update, so make sure we retry with original nodemask in the
+ * slow path.
+ */
goto no_zone;
}
@@ -3802,6 +3830,7 @@ retry_cpuset:
if (likely(page))
goto out;
+no_zone:
/*
* Runtime PM, block IO and its error handling path can deadlock
* because I/O on the device might not complete.
@@ -3813,21 +3842,10 @@ retry_cpuset:
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
- if (cpusets_enabled())
+ if (unlikely(ac.nodemask != nodemask))
ac.nodemask = nodemask;
- page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-no_zone:
- /*
- * When updating a task's mems_allowed, it is possible to race with
- * parallel threads in such a way that an allocation can fail while
- * the mask is being updated. If a page allocation is about to fail,
- * check if the cpuset changed during allocation and if so, retry.
- */
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
- alloc_mask = gfp_mask;
- goto retry_cpuset;
- }
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3904,8 +3922,8 @@ EXPORT_SYMBOL(free_pages);
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3925,8 +3943,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
-void *__alloc_page_frag(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count)) {
+ unsigned int order = compound_order(page);
+
+ if (order == 0)
+ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3934,7 +3967,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
if (unlikely(!nc->va)) {
refill:
- page = __page_frag_refill(nc, gfp_mask);
+ page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
@@ -3977,19 +4010,19 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
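
Note: the page fragment allocator keeps its calling convention across the rename (__alloc_page_frag -> page_frag_alloc, __free_page_frag -> page_frag_free), and __page_frag_cache_drain lets callers that took extra references on the backing page drop them in one go. A hedged usage sketch, with the fragment size and helper names invented for illustration:

/* Illustrative only: allocate and free one fragment from a frag cache. */
static void *grab_fragment(struct page_frag_cache *nc)
{
	/* 256 bytes is an arbitrary example size */
	return page_frag_alloc(nc, 256, GFP_ATOMIC);
}

static void drop_fragment(void *buf)
{
	/* drops one reference on the backing order-0 or compound page */
	page_frag_free(buf);
}
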
@@ -7241,6 +7274,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_io.c b/mm/page_io.c
index a2651f58c86a..23f6d0d3470f 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = -ENOMEM;
goto out;
}
- if (wbc->sync_mode == WB_SYNC_ALL)
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
- else
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index f696385bcc44..0686f566d347 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -886,7 +886,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
- if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
+ !is_power_of_2(align))) {
WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align);
return NULL;
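
Note: the tightened check also rejects alignments that are not powers of two, which the allocator's layout arithmetic silently assumed before. Invented sizes and alignments, just to show what the new check refuses:

/* Sketch: both calls compile, only the first survives the sanity check. */
static void percpu_align_example(void)
{
	void __percpu *ok  = __alloc_percpu(64, 32);	/* power-of-two: accepted */
	void __percpu *bad = __alloc_percpu(64, 24);	/* now WARNs and returns NULL */

	free_percpu(ok);
	free_percpu(bad);	/* free_percpu(NULL) is a no-op */
}
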
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- unsigned int flags = FOLL_REMOTE;
+ unsigned int flags = 0;
/* Work out address and page range required */
if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
size_t bytes;
/*
* Get the pages we're interested in. We must
- * add FOLL_REMOTE because task/mm might not
+ * access remotely because task/mm might not
* current/current->mm
*/
- pages = __get_user_pages_unlocked(task, mm, pa, pages,
- process_pages, flags);
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
if (pages <= 0)
return -EFAULT;
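
Note: get_user_pages_remote() now takes the gup_flags before the pages array plus an optional "locked" pointer, and it may drop mmap_sem itself when it has to sleep; the caller only releases the semaphore if it is still held on return. Reduced to a self-contained sketch (error handling trimmed, function name invented):

/* Sketch: remote gup with the new "locked" convention. */
static long pin_remote_pages(struct task_struct *task, struct mm_struct *mm,
			     unsigned long addr, unsigned long nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	int locked = 1;
	long got;

	down_read(&mm->mmap_sem);
	got = get_user_pages_remote(task, mm, addr, nr_pages, gup_flags,
				    pages, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);	/* gup may already have dropped it */
	return got;
}
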
diff --git a/mm/shmem.c b/mm/shmem.c
index abd7403aba41..3a7587a0314d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,7 +71,7 @@ static struct vfsmount *shm_mnt;
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include "internal.h"
@@ -415,6 +415,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
LIST_HEAD(list), *pos, *next;
+ LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
struct page *page;
@@ -441,9 +442,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* Check if there's anything to gain */
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
- list_del_init(&info->shrinklist);
+ list_move(&info->shrinklist, &to_remove);
removed++;
- iput(inode);
goto next;
}
@@ -454,6 +454,13 @@ next:
}
spin_unlock(&sbinfo->shrinklist_lock);
+ list_for_each_safe(pos, next, &to_remove) {
+ info = list_entry(pos, struct shmem_inode_info, shrinklist);
+ inode = &info->vfs_inode;
+ list_del_init(&info->shrinklist);
+ iput(inode);
+ }
+
list_for_each_safe(pos, next, &list) {
int ret;
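
Note: the shrinker change above is the classic deferred-iput pattern, entries whose last reference must go are moved to a private list under the spinlock, and iput(), which can sleep, runs only after the lock is released. As a standalone sketch (shrinklist/vfs_inode are the shmem fields used above, the function name is invented):

/* Sketch: drop inode references outside of a spinlock. */
static void drain_removed_inodes(struct list_head *to_remove)
{
	struct shmem_inode_info *info, *next;

	list_for_each_entry_safe(info, next, to_remove, shrinklist) {
		list_del_init(&info->shrinklist);
		iput(&info->vfs_inode);	/* may sleep, so no locks held here */
	}
}
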
@@ -661,8 +668,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
swapped++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
@@ -1049,6 +1056,30 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned long found = -1;
+ unsigned int checked = 0;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, root, &iter, 0) {
+ if (*slot == item) {
+ found = iter.index;
+ break;
+ }
+ checked++;
+ if ((checked % 4096) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
@@ -1062,7 +1093,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->page_tree, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -2447,8 +2478,8 @@ static void shmem_tag_pins(struct address_space *mapping)
}
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2517,8 +2548,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
spin_unlock_irq(&mapping->tree_lock);
continue_resched:
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
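
Note: all three shmem loops switch from radix_tree_iter_next() to radix_tree_iter_resume(), and the call moves to before cond_resched_rcu(): the iterator has to be parked while the RCU read lock is still held, since the slot pointer cannot be trusted once the lock has been dropped and reacquired. The idiom reduced to a skeleton, with the per-slot work elided:

/* Skeleton of a preemptible radix tree walk under RCU. */
static void walk_tree_preemptibly(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		/* ... inspect *slot here ... */
		if (need_resched()) {
			/* park the iterator while the RCU lock is still held */
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();	/* unlock, maybe reschedule, relock */
		}
	}
	rcu_read_unlock();
}
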
@@ -3188,7 +3219,6 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
- .readlink = generic_readlink,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3196,7 +3226,6 @@ static const struct inode_operations shmem_short_symlink_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
- .readlink = generic_readlink,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
diff --git a/mm/slab.c b/mm/slab.c
index 87b29e76cafd..4f2ec6bb46eb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -552,12 +552,7 @@ static void start_cpu_timer(int cpu)
{
struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
- /*
- * When this gets called from do_initcalls via cpucache_init(),
- * init_workqueues() has already run, so keventd will be setup
- * at that time.
- */
- if (keventd_up() && reap_work->work.func == NULL) {
+ if (reap_work->work.func == NULL) {
init_reap_node(cpu);
INIT_DEFERRABLE_WORK(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
@@ -2462,7 +2457,6 @@ union freelist_init_state {
unsigned int pos;
unsigned int *list;
unsigned int count;
- unsigned int rand;
};
struct rnd_state rnd_state;
};
@@ -2488,8 +2482,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
} else {
state->list = cachep->random_seq;
state->count = count;
- state->pos = 0;
- state->rand = rand;
+ state->pos = rand % count;
ret = true;
}
return ret;
@@ -2498,7 +2491,9 @@ static bool freelist_state_initialize(union freelist_init_state *state,
/* Get the next entry on the list and randomize it using a random shift */
static freelist_idx_t next_random_slot(union freelist_init_state *state)
{
- return (state->list[state->pos++] + state->rand) % state->count;
+ if (state->pos >= state->count)
+ state->pos = 0;
+ return state->list[state->pos++];
}
/* Swap two freelist entries */
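
Note: the net effect of the mm/slab.c freelist change is that the old "+ rand" offset, which could overflow and yield duplicate slots for large random values, is replaced by starting at a random position in one fixed pre-shuffled list and wrapping around it. A toy walk-through with invented numbers: count = 4 and rand = 10 give a starting pos of 10 % 4 = 2, so the slots come out as list[2], list[3], list[0], list[1]. Modelled outside the kernel for clarity:

/* Toy model of the new freelist walk (not the kernel code itself). */
static unsigned int toy_next_slot(const unsigned int *list,
				  unsigned int count, unsigned int *pos)
{
	if (*pos >= count)
		*pos = 0;	/* wrap around the precomputed permutation */
	return list[(*pos)++];
}
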
diff --git a/mm/slub.c b/mm/slub.c
index 067598a00849..7ec0a965c6a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -496,10 +496,11 @@ static inline int check_valid_pointer(struct kmem_cache *s,
return 1;
}
-static void print_section(char *text, u8 *addr, unsigned int length)
+static void print_section(char *level, char *text, u8 *addr,
+ unsigned int length)
{
metadata_access_enable();
- print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
metadata_access_disable();
}
@@ -636,14 +637,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ s->red_left_pad);
else if (p > addr + 16)
- print_section("Bytes b4 ", p - 16, 16);
+ print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section("Object ", p, min_t(unsigned long, s->object_size,
- PAGE_SIZE));
+ print_section(KERN_ERR, "Object ", p,
+ min_t(unsigned long, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
if (s->offset)
@@ -658,7 +660,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, size_from_object(s) - off);
+ print_section(KERN_ERR, "Padding ", p + off,
+ size_from_object(s) - off);
dump_stack();
}
@@ -820,7 +823,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding ", end - remainder, remainder);
+ print_section(KERN_ERR, "Padding ", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
@@ -973,7 +976,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object ", (void *)object,
+ print_section(KERN_INFO, "Object ", (void *)object,
s->object_size);
dump_stack();
@@ -1419,6 +1422,10 @@ static int init_cache_random_seq(struct kmem_cache *s)
int err;
unsigned long i, count = oo_objects(s->oo);
+ /* Bailout if already initialised */
+ if (s->random_seq)
+ return 0;
+
err = cache_random_seq_create(s, count, GFP_KERNEL);
if (err) {
pr_err("SLUB: Unable to initialize free list for %s\n",
diff --git a/mm/swap.c b/mm/swap.c
index 4dcf852e1e6d..844baedd2429 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
+ __ClearPageWaiters(page);
mem_cgroup_uncharge(page);
}
@@ -784,6 +785,7 @@ void release_pages(struct page **pages, int nr, bool cold)
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive(page);
+ __ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1c6e0321205d..4761701d1721 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
- if (count == 1 && !PageWriteback(page)) {
+ if (count != 1)
+ goto out;
+ if (!PageWriteback(page)) {
delete_from_swap_cache(page);
SetPageDirty(page);
+ } else {
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p->flags & SWP_STABLE_WRITES) {
+ spin_unlock(&p->lock);
+ return false;
+ }
+ spin_unlock(&p->lock);
}
}
+out:
return count <= 1;
}
@@ -2448,6 +2462,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -ENOMEM;
goto bad_swap;
}
+
+ if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ p->flags |= SWP_STABLE_WRITES;
+
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
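
Note: reuse_swap_page() can now refuse an in-place reuse while the page is being written to a backing device that demands stable pages (SWP_STABLE_WRITES, set at swapon time from bdi_cap_stable_pages_required()), so a write fault has to copy instead of modifying the page mid-flight. A simplified consumer sketch, not the actual do_wp_page() code and with an invented function name:

/* Sketch: decide between in-place reuse and copy-on-write. */
static bool try_reuse_or_copy(struct page *page)
{
	int total_mapcount;

	/*
	 * false now also covers the stable-pages writeback case, forcing
	 * the fault handler onto the copy path.
	 */
	return reuse_swap_page(page, &total_mapcount);
}
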
diff --git a/mm/truncate.c b/mm/truncate.c
index fd97f1dbce29..dd7b24e083c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -24,20 +24,12 @@
#include <linux/rmap.h>
#include "internal.h"
-static void clear_exceptional_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+ void *entry)
{
struct radix_tree_node *node;
void **slot;
- /* Handled by shmem itself */
- if (shmem_mapping(mapping))
- return;
-
- if (dax_mapping(mapping)) {
- dax_delete_mapping_entry(mapping, index);
- return;
- }
spin_lock_irq(&mapping->tree_lock);
/*
* Regular page slots are stabilized by the page lock even
@@ -55,6 +47,56 @@ unlock:
spin_unlock_irq(&mapping->tree_lock);
}
+/*
+ * Unconditionally remove exceptional entry. Usually called from truncate path.
+ */
+static void truncate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return;
+
+ if (dax_mapping(mapping)) {
+ dax_delete_mapping_entry(mapping, index);
+ return;
+ }
+ clear_shadow_entry(mapping, index, entry);
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
+ * clean entries.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */
+static int invalidate_exceptional_entry2(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ /* Handled by shmem itself */
+ if (shmem_mapping(mapping))
+ return 1;
+ if (dax_mapping(mapping))
+ return dax_invalidate_mapping_entry_sync(mapping, index);
+ clear_shadow_entry(mapping, index, entry);
+ return 1;
+}
+
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
}
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ truncate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ invalidate_exceptional_entry(mapping, index,
+ page);
continue;
}
@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
break;
if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
+ if (!invalidate_exceptional_entry2(mapping,
+ index, page))
+ ret = -EBUSY;
continue;
}
diff --git a/mm/util.c b/mm/util.c
index 1a41553db866..3cb2164f4099 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -13,7 +13,7 @@
#include <linux/vmalloc.h>
#include <asm/sections.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "internal.h"
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5584384eabc..3ca82d44edd3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -32,7 +32,7 @@
#include <linux/llist.h>
#include <linux/bitops.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6aa5b01d3e75..532a2a750952 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}
+unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int zone_idx)
+{
+ if (!mem_cgroup_disabled())
+ return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx);
+
+ return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx],
+ NR_ZONE_LRU_BASE + lru);
+}
+
/*
* Add a shrinker callback to be called from the vm.
*/
@@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
- enum lru_list lru, unsigned long *nr_zone_taken,
- unsigned long nr_taken)
+ enum lru_list lru, unsigned long *nr_zone_taken)
{
int zid;
@@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
continue;
__update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
- }
-
#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+ mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
#endif
+ }
+
}
/*
@@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
- update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
+ update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
@@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
if (!managed_zone(zone))
continue;
- inactive_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE));
- active_zone = zone_page_state(zone,
- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+ inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid);
+ active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid);
inactive -= min(inactive, inactive_zone);
active -= min(active, active_zone);
diff --git a/mm/workingset.c b/mm/workingset.c
index 241fa5d6b3b2..abb58ffa3c64 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -473,7 +473,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->page_tree, node);
+ __radix_tree_delete_node(&mapping->page_tree, node,
+ workingset_update_node, mapping);
out_invalid:
spin_unlock(&mapping->tree_lock);
diff --git a/mm/zswap.c b/mm/zswap.c
index 067a0d62f318..cabf09e0128b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -78,7 +78,13 @@ static u64 zswap_duplicate_entry;
/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
-module_param_named(enabled, zswap_enabled, bool, 0644);
+static int zswap_enabled_param_set(const char *,
+ const struct kernel_param *);
+static struct kernel_param_ops zswap_enabled_param_ops = {
+ .set = zswap_enabled_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
@@ -176,6 +182,9 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0);
/* used by param callback function */
static bool zswap_init_started;
+/* fatal error during init */
+static bool zswap_init_failed;
+
/*********************************
* helpers and fwd declarations
**********************************/
@@ -624,6 +633,11 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *s = strstrip((char *)val);
int ret;
+ if (zswap_init_failed) {
+ pr_err("can't set param, initialization failed\n");
+ return -ENODEV;
+ }
+
/* no change required */
if (!strcmp(s, *(char **)kp->arg))
return 0;
@@ -703,6 +717,17 @@ static int zswap_zpool_param_set(const char *val,
return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
+static int zswap_enabled_param_set(const char *val,
+ const struct kernel_param *kp)
+{
+ if (zswap_init_failed) {
+ pr_err("can't enable, initialization failed\n");
+ return -ENODEV;
+ }
+
+ return param_set_bool(val, kp);
+}
+
/*********************************
* writeback code
**********************************/
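
Note: zswap's enabled flag moves from module_param_named() to module_param_cb() so the set hook can refuse to enable a module whose init failed. The same guard boiled down to a skeleton for a hypothetical parameter, all names here invented:

/* Skeleton of a guarded boolean module parameter. */
static bool example_enabled;
static bool example_init_failed;

static int example_enabled_set(const char *val, const struct kernel_param *kp)
{
	if (example_init_failed)
		return -ENODEV;	/* refuse: the module never initialised */
	return param_set_bool(val, kp);
}

static const struct kernel_param_ops example_enabled_ops = {
	.set = example_enabled_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &example_enabled_ops, &example_enabled, 0644);
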
@@ -1201,6 +1226,9 @@ hp_fail:
dstmem_fail:
zswap_entry_cache_destroy();
cache_fail:
+ /* if built-in, we aren't unloaded on failure; don't allow use */
+ zswap_init_failed = true;
+ zswap_enabled = false;
return -ENOMEM;
}
/* must be late so crypto has time to come up */