diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/compaction.c | 17 | ||||
-rw-r--r-- | mm/fadvise.c | 15 | ||||
-rw-r--r-- | mm/filemap.c | 245 | ||||
-rw-r--r-- | mm/gup.c | 20 | ||||
-rw-r--r-- | mm/huge_memory.c | 196 | ||||
-rw-r--r-- | mm/hugetlb.c | 37 | ||||
-rw-r--r-- | mm/init-mm.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 9 | ||||
-rw-r--r-- | mm/kasan/report.c | 3 | ||||
-rw-r--r-- | mm/khugepaged.c | 57 | ||||
-rw-r--r-- | mm/memcontrol.c | 24 | ||||
-rw-r--r-- | mm/memory-failure.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 922 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 56 | ||||
-rw-r--r-- | mm/mempolicy.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 14 | ||||
-rw-r--r-- | mm/mincore.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mprotect.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 15 | ||||
-rw-r--r-- | mm/page-writeback.c | 29 | ||||
-rw-r--r-- | mm/page_alloc.c | 126 | ||||
-rw-r--r-- | mm/page_io.c | 5 | ||||
-rw-r--r-- | mm/percpu.c | 3 | ||||
-rw-r--r-- | mm/process_vm_access.c | 12 | ||||
-rw-r--r-- | mm/shmem.c | 47 | ||||
-rw-r--r-- | mm/slab.c | 15 | ||||
-rw-r--r-- | mm/slub.c | 27 | ||||
-rw-r--r-- | mm/swap.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 20 | ||||
-rw-r--r-- | mm/truncate.c | 75 | ||||
-rw-r--r-- | mm/util.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 27 | ||||
-rw-r--r-- | mm/workingset.c | 3 | ||||
-rw-r--r-- | mm/zswap.c | 30 |
38 files changed, 1251 insertions, 825 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fde443f36d7..3bfed5ab2475 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + wb->dirty_sleep = jiffies; wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); if (!wb->congested) diff --git a/mm/compaction.c b/mm/compaction.c index 223464227299..949198d01260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, page_count(page) > page_mapcount(page)) goto isolate_fail; + /* + * Only allow to migrate anonymous pages in GFP_NOFS context + * because those do not depend on fs locks. + */ + if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) + goto isolate_fail; + /* If we already hold the lock, we can skip some rechecking */ if (!locked) { locked = compact_trylock_irqsave(zone_lru_lock(zone), @@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio) { - int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* Check if the GFP flags allow compaction */ - if (!may_enter_fs || !may_perform_io) + /* + * Check if the GFP flags allow compaction - GFP_NOIO is really + * tricky context because the migration might require IO + */ + if (!may_perform_io) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); @@ -1751,6 +1760,7 @@ static void compact_node(int nid) .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, + .gfp_mask = GFP_KERNEL, }; @@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, + .gfp_mask = GFP_KERNEL, }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, diff --git a/mm/fadvise.c b/mm/fadvise.c index 6c707bfe02fd..a43013112581 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -139,7 +139,20 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) } if (end_index >= start_index) { - unsigned long count = invalidate_mapping_pages(mapping, + unsigned long count; + + /* + * It's common to FADV_DONTNEED right after + * the read or write that instantiates the + * pages, in which case there will be some + * sitting on the local LRU cache. Try to + * avoid the expensive remote drain and the + * second cache tree walk below by flushing + * them out right away. + */ + lru_add_drain(); + + count = invalidate_mapping_pages(mapping, start_index, end_index); /* diff --git a/mm/filemap.c b/mm/filemap.c index 5b4dd03130da..3f9afded581b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,11 +135,10 @@ static int page_cache_tree_insert(struct address_space *mapping, } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != - (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK)); + dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, - false); + dax_wake_mapping_entry_waiter(mapping, page->index, p, + true); } } __radix_tree_replace(&mapping->page_tree, node, slot, page, @@ -740,45 +739,159 @@ EXPORT_SYMBOL(__page_cache_alloc); * at a cost of "thundering herd" phenomena during rare hash * collisions. */ -wait_queue_head_t *page_waitqueue(struct page *page) +#define PAGE_WAIT_TABLE_BITS 8 +#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) +static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; + +static wait_queue_head_t *page_waitqueue(struct page *page) { - return bit_waitqueue(page, 0); + return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)]; } -EXPORT_SYMBOL(page_waitqueue); -void wait_on_page_bit(struct page *page, int bit_nr) +void __init pagecache_init(void) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + int i; + + for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(&page_wait_table[i]); - if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, - TASK_UNINTERRUPTIBLE); + page_writeback_init(); } -EXPORT_SYMBOL(wait_on_page_bit); -int wait_on_page_bit_killable(struct page *page, int bit_nr) +struct wait_page_key { + struct page *page; + int bit_nr; + int page_match; +}; + +struct wait_page_queue { + struct page *page; + int bit_nr; + wait_queue_t wait; +}; + +static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + struct wait_page_key *key = arg; + struct wait_page_queue *wait_page + = container_of(wait, struct wait_page_queue, wait); - if (!test_bit(bit_nr, &page->flags)) + if (wait_page->page != key->page) + return 0; + key->page_match = 1; + + if (wait_page->bit_nr != key->bit_nr) + return 0; + if (test_bit(key->bit_nr, &key->page->flags)) return 0; - return __wait_on_bit(page_waitqueue(page), &wait, - bit_wait_io, TASK_KILLABLE); + return autoremove_wake_function(wait, mode, sync, key); } -int wait_on_page_bit_killable_timeout(struct page *page, - int bit_nr, unsigned long timeout) +void wake_up_page_bit(struct page *page, int bit_nr) { - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + wait_queue_head_t *q = page_waitqueue(page); + struct wait_page_key key; + unsigned long flags; - wait.key.timeout = jiffies + timeout; - if (!test_bit(bit_nr, &page->flags)) - return 0; - return __wait_on_bit(page_waitqueue(page), &wait, - bit_wait_io_timeout, TASK_KILLABLE); + key.page = page; + key.bit_nr = bit_nr; + key.page_match = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_locked_key(q, TASK_NORMAL, &key); + /* + * It is possible for other pages to have collided on the waitqueue + * hash, so in that case check for a page match. That prevents a long- + * term waiter + * + * It is still possible to miss a case here, when we woke page waiters + * and removed them from the waitqueue, but there are still other + * page waiters. + */ + if (!waitqueue_active(q) || !key.page_match) { + ClearPageWaiters(page); + /* + * It's possible to miss clearing Waiters here, when we woke + * our page waiters, but the hashed waitqueue has waiters for + * other pages on it. + * + * That's okay, it's a rare case. The next waker will clear it. + */ + } + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(wake_up_page_bit); + +static inline int wait_on_page_bit_common(wait_queue_head_t *q, + struct page *page, int bit_nr, int state, bool lock) +{ + struct wait_page_queue wait_page; + wait_queue_t *wait = &wait_page.wait; + int ret = 0; + + init_wait(wait); + wait->func = wake_page_function; + wait_page.page = page; + wait_page.bit_nr = bit_nr; + + for (;;) { + spin_lock_irq(&q->lock); + + if (likely(list_empty(&wait->task_list))) { + if (lock) + __add_wait_queue_tail_exclusive(q, wait); + else + __add_wait_queue(q, wait); + SetPageWaiters(page); + } + + set_current_state(state); + + spin_unlock_irq(&q->lock); + + if (likely(test_bit(bit_nr, &page->flags))) { + io_schedule(); + if (unlikely(signal_pending_state(state, current))) { + ret = -EINTR; + break; + } + } + + if (lock) { + if (!test_and_set_bit_lock(bit_nr, &page->flags)) + break; + } else { + if (!test_bit(bit_nr, &page->flags)) + break; + } + } + + finish_wait(q, wait); + + /* + * A signal could leave PageWaiters set. Clearing it here if + * !waitqueue_active would be possible (by open-coding finish_wait), + * but still fail to catch it in the case of wait hash collision. We + * already can fail to clear wait hash collision cases, so don't + * bother with signals either. + */ + + return ret; +} + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + wait_queue_head_t *q = page_waitqueue(page); + wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false); +} +EXPORT_SYMBOL(wait_on_page_bit); + +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + wait_queue_head_t *q = page_waitqueue(page); + return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false); } -EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue @@ -794,10 +907,34 @@ void add_page_wait_queue(struct page *page, wait_queue_t *waiter) spin_lock_irqsave(&q->lock, flags); __add_wait_queue(q, waiter); + SetPageWaiters(page); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(add_page_wait_queue); +#ifndef clear_bit_unlock_is_negative_byte + +/* + * PG_waiters is the high bit in the same byte as PG_lock. + * + * On x86 (and on many other architectures), we can clear PG_lock and + * test the sign bit at the same time. But if the architecture does + * not support that special operation, we just do this all by hand + * instead. + * + * The read of PG_waiters has to be after (or concurrently with) PG_locked + * being cleared, but a memory barrier should be unneccssary since it is + * in the same byte as PG_locked. + */ +static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) +{ + clear_bit_unlock(nr, mem); + /* smp_mb__after_atomic(); */ + return test_bit(PG_waiters, mem); +} + +#endif + /** * unlock_page - unlock a locked page * @page: the page @@ -807,16 +944,19 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); * mechanism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The mb is necessary to enforce ordering between the clear_bit and the read - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). + * Note that this depends on PG_waiters being the sign bit in the byte + * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to + * clear the PG_locked bit and test PG_waiters at the same time fairly + * portably (architectures that do LL/SC can test any bit, while x86 can + * test the sign bit). */ void unlock_page(struct page *page) { + BUILD_BUG_ON(PG_waiters != 7); page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); - clear_bit_unlock(PG_locked, &page->flags); - smp_mb__after_atomic(); - wake_up_page(page, PG_locked); + if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags)) + wake_up_page_bit(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -875,23 +1015,19 @@ EXPORT_SYMBOL_GPL(page_endio); * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock */ -void __lock_page(struct page *page) +void __lock_page(struct page *__page) { - struct page *page_head = compound_head(page); - DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - - __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, - TASK_UNINTERRUPTIBLE); + struct page *page = compound_head(__page); + wait_queue_head_t *q = page_waitqueue(page); + wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true); } EXPORT_SYMBOL(__lock_page); -int __lock_page_killable(struct page *page) +int __lock_page_killable(struct page *__page) { - struct page *page_head = compound_head(page); - DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - - return __wait_on_bit_lock(page_waitqueue(page_head), &wait, - bit_wait_io, TASK_KILLABLE); + struct page *page = compound_head(__page); + wait_queue_head_t *q = page_waitqueue(page); + return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -1638,7 +1774,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, int error = 0; if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) - return -EINVAL; + return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); index = *ppos >> PAGE_SHIFT; @@ -1655,6 +1791,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, cond_resched(); find_page: + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out; + } + page = find_get_page(mapping, index); if (!page) { page_cache_sync_readahead(mapping, @@ -2165,12 +2306,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2226,11 +2367,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2240,7 +2381,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..5f3ad65c85de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -783,6 +783,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); + /* + * When we COW a devmap PMD entry, we split it into PTEs, so we should + * not be in this function with `flags & FOLL_COW` set. + */ + WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); + if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; @@ -879,30 +885,32 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + if (write) + entry = pmd_mkdirty(entry); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -919,9 +927,8 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -952,15 +959,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -969,20 +976,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -993,7 +1000,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -1005,23 +1012,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1034,13 +1041,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1053,12 +1060,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1070,7 +1077,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1090,11 +1097,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1102,12 +1109,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1117,16 +1124,26 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } +/* + * FOLL_FORCE can write to even unwritable pmd's, but only + * after we've gone through a COW cycle and they are dirty. + */ +static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +{ + return pmd_write(pmd) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); +} + struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -1137,7 +1154,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !pmd_write(*pmd)) + if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) goto out; /* Avoid dumping huge zero page */ @@ -1196,12 +1213,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1209,8 +1226,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1218,9 +1235,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1253,7 +1270,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1264,12 +1281,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1287,9 +1304,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1304,18 +1321,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3edb759c5c7d..c7025c132670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1773,23 +1773,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } diff --git a/mm/init-mm.c b/mm/init-mm.c index a56a851908d2..975e49f00f34 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -6,6 +6,7 @@ #include <linux/cpumask.h> #include <linux/atomic.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -21,5 +22,6 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..7aa2ea0a8623 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,9 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +void page_writeback_init(void); + +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 0e9505f66ec1..b2a0cff2bb35 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task) /* Unpoison the stack for the current task beyond a watermark sp value. */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, watermark); + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); } /* diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b82b3e215157..f479365530b6 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,6 +13,7 @@ * */ +#include <linux/ftrace.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/printk.h> @@ -300,6 +301,8 @@ void kasan_report(unsigned long addr, size_t size, if (likely(!kasan_report_enabled())) return; + disable_trace_on_warning(); + info.access_addr = (void *)addr; info.access_size = size; info.is_write = is_write; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..77ae3239c3de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, + .pgoff = linear_page_index(vma, address), }; /* we only decide to swapin, if there is enough young ptes */ @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; - if (!is_swap_pte(pteval)) + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } @@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* * Before allocating the hugepage, release the mmap_sem read lock. @@ -1242,7 +1242,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; - bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1267,26 +1266,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); - /* - * now deposit the pgtable for arch that need it - * otherwise free it. - */ - if (arch_needs_pgtable_deposit()) { - /* - * The deposit should be visibile only after - * collapse is seen by others. - */ - smp_wmb(); - pgtable_trans_huge_deposit(vma->vm_mm, pmd, - pmd_pgtable(_pmd)); - deposited = true; - } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - if (!deposited) { - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); - } + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } i_mmap_unlock_write(mapping); @@ -1326,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm, VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | - __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { @@ -1446,7 +1428,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1546,7 +1528,6 @@ tree_unlocked: /* Put holes back where they were */ radix_tree_delete(&mapping->page_tree, iter.index); - slot = radix_tree_iter_next(&iter); continue; } @@ -1557,11 +1538,11 @@ tree_unlocked: page_ref_unfreeze(page, 2); radix_tree_replace_slot(&mapping->page_tree, slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); - slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); @@ -1641,8 +1622,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 175ec51c346d..b822e158b319 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -68,7 +68,7 @@ #include <net/ip.h> #include "slab.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <trace/events/vmscan.h> @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1001,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,27 +1009,25 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; - bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; - empty = list_empty(lruvec->lists + lru); + lru_size = &mz->lru_zone_size[zid][lru]; if (nr_pages < 0) *lru_size += nr_pages; size = *lru_size; - if (WARN_ONCE(size < 0 || empty != !size, - "%s(%p, %d, %d): lru_size %ld but %sempty\n", - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } @@ -4355,9 +4353,9 @@ static int mem_cgroup_do_precharge(unsigned long count) return ret; } - /* Try charges one by one with reclaim */ + /* Try charges one by one with reclaim, but do not retry */ while (count--) { - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); if (ret) return ret; mc.precharge++; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 19e796d36a62..f283c7e0a2a3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -764,12 +764,11 @@ static int me_huge_page(struct page *p, unsigned long pfn) */ #define dirty (1UL << PG_dirty) -#define sc (1UL << PG_swapcache) +#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked)) #define unevict (1UL << PG_unevictable) #define mlock (1UL << PG_mlocked) #define writeback (1UL << PG_writeback) #define lru (1UL << PG_lru) -#define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -819,7 +818,6 @@ static struct page_state { #undef mlock #undef writeback #undef lru -#undef swapbacked #undef head #undef slab #undef reserved diff --git a/mm/memory.c b/mm/memory.c index 32e9b7aec366..6bf2b471e30c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,7 +68,7 @@ #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/pgtable.h> @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static int do_page_mkwrite(struct vm_fault *vmf) { - struct vm_fault vmf; int ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); - vmf.pgoff = page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.page = page; - vmf.cow_page = NULL; + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->page_mkwrite(vma, &vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { @@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static void fault_dirty_shared_page(struct vm_area_struct *vma, + struct page *page) +{ + struct address_space *mapping; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); +} + +/* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, @@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); - - if (dirty_shared) { - struct address_space *mapping; - int dirtied; - - if (!page_mkwrite) - lock_page(page); - - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); - mapping = page->mapping; - unlock_page(page); - put_page(page); - - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!page_mkwrite) - file_update_time(vma->vm_file); - } - - return VM_FAULT_WRITE; + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); } /* @@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; + struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2263,79 +2269,91 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return VM_FAULT_NOPAGE; + } + wp_page_reuse(vmf); + return 0; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { - .page = NULL, - .pgoff = linear_page_index(vma, fe->address), - .virtual_address = - (void __user *)(fe->address & PAGE_MASK), - .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, - }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); - if (ret & VM_FAULT_ERROR) + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->flags |= FAULT_FLAG_MKWRITE; + ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, - struct page *old_page) - __releases(fe->ptl) +static int wp_page_shared(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - int page_mkwrite = 0; + struct vm_area_struct *vma = vmf->vma; - get_page(old_page); + get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(old_page); + put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); - return 0; + tmp = finish_mkwrite_fault(vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + unlock_page(vmf->page); + put_page(vmf->page); + return tmp; } - page_mkwrite = 1; + } else { + wp_page_reuse(vmf); + lock_page(vmf->page); } + fault_dirty_shared_page(vma, vmf->page); + put_page(vmf->page); - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return VM_FAULT_WRITE; } /* @@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; - struct page *old_page; + struct vm_area_struct *vma = vmf->vma; - old_page = vm_normal_page(vma, fe->address, orig_pte); - if (!old_page) { + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { int total_mapcount; - if (!trylock_page(old_page)) { - get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - put_page(old_page); + if (!trylock_page(vmf->page)) { + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + lock_page(vmf->page); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + unlock_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + put_page(vmf->page); return 0; } - put_page(old_page); + put_page(vmf->page); } - if (reuse_swap_page(old_page, &total_mapcount)) { + if (reuse_swap_page(vmf->page, &total_mapcount)) { if (total_mapcount == 1) { /* * The page is all ours. Move it to @@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) * Protected against the rmap code by * the page lock. */ - page_move_anon_rmap(old_page, vma); + page_move_anon_rmap(vmf->page, vma); } - unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + unlock_page(vmf->page); + wp_page_reuse(vmf); + return VM_FAULT_WRITE; } - unlock_page(old_page); + unlock_page(vmf->page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf); } /* * Ok, we need to copy. Oh, well.. */ - get_page(old_page); + get_page(vmf->page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2863,50 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, - struct page *cow_page, struct page **page, void **entry) +static int __do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; - - ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + ret = vma->vm_ops->fault(vma, vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DONE_COW))) return ret; - if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; - return ret; - } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf->page); + put_page(vmf->page); + vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf->page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); - *page = vmf.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2921,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,28 +2998,21 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: - /* - * If we are going to fallback to pte mapping, do a - * withdraw with pmd lock held. - */ - if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,44 +3023,43 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) - goto fault_handled; + return ret; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) - goto fault_handled; + return ret; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { - ret = VM_FAULT_NOPAGE; - goto fault_handled; - } + if (unlikely(!pte_none(*vmf->pte))) + return VM_FAULT_NOPAGE; flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3068,25 +3068,50 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); - ret = 0; + update_mmu_cache(vma, vmf->address, vmf->pte); -fault_handled: - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } + return 0; +} + + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -3154,17 +3179,18 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; + pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,45 +3198,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; + struct vm_area_struct *vma = vmf->vma; int ret = 0; /* @@ -3219,80 +3245,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - unlock_page(fault_page); + ret |= finish_fault(vmf); + unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(fault_page); + put_page(vmf->page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page, *new_page; - void *fault_entry; - struct mem_cgroup *memcg; + struct vm_area_struct *vma = vmf->vma; int ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); - if (!new_page) + vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (!vmf->cow_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { - put_page(new_page); + if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, + &vmf->memcg, false)) { + put_page(vmf->cow_page); return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); - __SetPageUptodate(new_page); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(fault_page); - put_page(fault_page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); - } + ret |= finish_fault(vmf); + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg, false); - put_page(new_page); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); + put_page(vmf->cow_page); return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - struct page *fault_page; - struct address_space *mapping; - int dirtied = 0; + struct vm_area_struct *vma = vmf->vma; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3301,46 +3314,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + unlock_page(vmf->page); + tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(fault_page); + put_page(vmf->page); return tmp; } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(fault_page); - put_page(fault_page); + unlock_page(vmf->page); + put_page(vmf->page); return ret; } - if (set_page_dirty(fault_page)) - dirtied = 1; - /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s - * release semantics to prevent the compiler from undoing this copying. - */ - mapping = page_rmapping(fault_page); - unlock_page(fault_page); - if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - - if (!vma->vm_ops->page_mkwrite) - file_update_time(vma->vm_file); - + fault_dirty_shared_page(vma, vmf->page); return ret; } @@ -3350,19 +3341,27 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; + int ret; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) - return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + ret = VM_FAULT_SIGBUS; + else if (!(vmf->flags & FAULT_FLAG_WRITE)) + ret = do_read_fault(vmf); + else if (!(vma->vm_flags & VM_SHARED)) + ret = do_cow_fault(vmf); + else + ret = do_shared_fault(vmf); + + /* preallocated pagetable is unused: free it */ + if (vmf->prealloc_pte) { + pte_free(vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; + } + return ret; } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,14 +3379,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3400,10 +3400,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. */ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3412,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3447,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3469,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3515,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3537,8 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); - - entry = *fe->pte; + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3550,38 +3549,39 @@ static int handle_pte_fault(struct fault_env *fe) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + if (pte_none(vmf->orig_pte)) { + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(fe, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + entry = vmf->orig_pte; + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3589,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,10 +3606,12 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, + .pgoff = linear_page_index(vma, address), + .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -3619,35 +3621,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* @@ -3770,8 +3772,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; @@ -3788,11 +3790,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, pmd = pmd_offset(pud, address); VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - /* We cannot handle huge page PFN maps. Luckily they don't exist. */ - if (pmd_huge(*pmd)) + if (pmd_huge(*pmd)) { + if (!pmdpp) + goto out; + + *ptlp = pmd_lock(mm, pmd); + if (pmd_huge(*pmd)) { + *pmdpp = pmd; + return 0; + } + spin_unlock(*ptlp); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); @@ -3815,9 +3826,23 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte(mm, address, ptepp, ptlp))); + !(res = __follow_pte_pmd(mm, address, ptepp, NULL, + ptlp))); + return res; +} + +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, + ptlp))); return res; } +EXPORT_SYMBOL(follow_pte_pmd); /** * follow_pfn - look up PFN at a user virtual address @@ -3904,7 +3929,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys); * Access another process' address space as given in mm. If non-NULL, use the * given task for page fault accounting. */ -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; @@ -3919,7 +3944,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; @@ -4002,6 +4027,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, return ret; } +EXPORT_SYMBOL_GPL(access_process_vm); /* * Print the name of a VMA. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e43142c15631..b8c11e063ff0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1033,36 +1033,39 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -int zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target) +bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target, int *zone_shift) { struct zone *zone = page_zone(pfn_to_page(pfn)); enum zone_type idx = zone_idx(zone); int i; + *zone_shift = 0; + if (idx < target) { /* pages must be at end of current zone */ if (pfn + nr_pages != zone_end_pfn(zone)) - return 0; + return false; /* no zones in use between current zone and target */ for (i = idx + 1; i < target; i++) if (zone_is_initialized(zone - idx + i)) - return 0; + return false; } if (target < idx) { /* pages must be at beginning of current zone */ if (pfn != zone->zone_start_pfn) - return 0; + return false; /* no zones in use between current zone and target */ for (i = target + 1; i < idx; i++) if (zone_is_initialized(zone - idx + i)) - return 0; + return false; } - return target - idx; + *zone_shift = target - idx; + return true; } /* Must be protected by mem_hotplug_begin() */ @@ -1089,10 +1092,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ !can_online_high_movable(zone)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) - zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL); - else if (online_type == MMOP_ONLINE_MOVABLE) - zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE); + if (online_type == MMOP_ONLINE_KERNEL) { + if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) + return -EINVAL; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) + return -EINVAL; + } zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); if (!zone) @@ -1477,17 +1483,20 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) is belongs to the same zone. + * Confirm all pages in a range [start, end) belong to the same zone. + * When true, return its valid [start, end). */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end) { unsigned long pfn, sec_end_pfn; + unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; - pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { /* Make sure the memory section is present first */ if (!present_section_nr(pfn_to_section_nr(pfn))) continue; @@ -1503,10 +1512,20 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; + if (!zone) + start = pfn + i; zone = page_zone(page); + end = pfn + MAX_ORDER_NR_PAGES; } } - return 1; + + if (zone) { + *valid_start = start; + *valid_end = end; + return 1; + } else { + return 0; + } } /* @@ -1833,6 +1852,7 @@ static int __ref __offline_pages(unsigned long start_pfn, long offlined_pages; int ret, drain, retry_max, node; unsigned long flags; + unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; @@ -1843,10 +1863,10 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return -EINVAL; - zone = page_zone(pfn_to_page(start_pfn)); + zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6d3639e1f254..1e7873e40c9a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -96,7 +96,7 @@ #include <linux/printk.h> #include <asm/tlbflush.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -2017,8 +2017,8 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); zl = policy_zonelist(gfp, pol, node); - mpol_cond_put(pol); page = __alloc_pages_nodemask(gfp, order, zl, nmask); + mpol_cond_put(pol); out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; diff --git a/mm/migrate.c b/mm/migrate.c index 0ed24b1fa77b..87f4d0f81819 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -466,13 +466,15 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - __SetPageSwapBacked(newpage); - get_page(newpage); /* add cache reference */ - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); + if (PageSwapBacked(page)) { + __SetPageSwapBacked(newpage); + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + } else { + VM_BUG_ON_PAGE(PageSwapCache(page), page); } /* Move dirty while page refs frozen and newpage not yet exposed */ diff --git a/mm/mincore.c b/mm/mincore.c index bfb866435478..ddb872da3f5b 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -16,7 +16,7 @@ #include <linux/swapops.h> #include <linux/hugetlb.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index 1af87c14183d..dc4291dcc99b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,7 +45,7 @@ #include <linux/moduleparam.h> #include <linux/pkeys.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> diff --git a/mm/mprotect.c b/mm/mprotect.c index cc2459c57f60..f9c07f54dd62 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -25,7 +25,7 @@ #include <linux/perf_event.h> #include <linux/pkeys.h> #include <linux/ksm.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> diff --git a/mm/nommu.c b/mm/nommu.c index 8b8faaf2a9e9..24f9f5f39145 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -35,7 +35,7 @@ #include <linux/audit.h> #include <linux/printk.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) @@ -1801,14 +1801,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); } EXPORT_SYMBOL(filemap_map_pages); -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; @@ -1878,6 +1878,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } +EXPORT_SYMBOL_GPL(access_process_vm); /** * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 439cc63ad903..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1778,6 +1778,7 @@ pause: pause, start_time); __set_current_state(TASK_KILLABLE); + wb->dirty_sleep = now; io_schedule_timeout(pause); current->dirty_paused_when = now + pause; @@ -2105,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bcb43b7..f3e0c69a97b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,14 +1864,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; @@ -2583,30 +2583,22 @@ int __isolate_free_page(struct page *page, unsigned int order) * Update NUMA hit/miss statistics * * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, - gfp_t flags) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - int local_nid = numa_node_id(); enum zone_stat_item local_stat = NUMA_LOCAL; - if (unlikely(flags & __GFP_OTHER_NODE)) { + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - if (z->node == local_nid) { + if (z->node == preferred_zone->node) __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { + else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } + __inc_zone_state(z, local_stat); #endif } @@ -2674,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, gfp_flags); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -3531,12 +3523,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; - enum compact_priority compact_priority = DEF_COMPACT_PRIORITY; + enum compact_priority compact_priority; enum compact_result compact_result; - int compaction_retries = 0; - int no_progress_loops = 0; + int compaction_retries; + int no_progress_loops; unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; + unsigned int cpuset_mems_cookie; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3557,6 +3550,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; +retry_cpuset: + compaction_retries = 0; + no_progress_loops = 0; + compact_priority = DEF_COMPACT_PRIORITY; + cpuset_mems_cookie = read_mems_allowed_begin(); + /* + * We need to recalculate the starting point for the zonelist iterator + * because we might have used different nodemask in the fast path, or + * there was a cpuset modification and we are retrying - otherwise we + * could end up iterating over non-eligible zones endlessly. + */ + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); + if (!ac->preferred_zoneref->zone) + goto nopage; + + /* * The fast path uses conservative alloc_flags to succeed only until * kswapd needs to be woken up, and to avoid the cost of setting up @@ -3716,6 +3726,13 @@ retry: &compaction_retries)) goto retry; + /* + * It's possible we raced with cpuset update so the OOM would be + * premature (see below the nopage: label for full explanation). + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry_cpuset; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) @@ -3728,6 +3745,16 @@ retry: } nopage: + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + goto retry_cpuset; + warn_alloc(gfp_mask, "page allocation failure: order:%u", order); got_pg: @@ -3742,7 +3769,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { struct page *page; - unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW; gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { @@ -3779,9 +3805,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -3792,8 +3815,13 @@ retry_cpuset: */ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask); - if (!ac.preferred_zoneref) { + if (!ac.preferred_zoneref->zone) { page = NULL; + /* + * This might be due to race with cpuset_current_mems_allowed + * update, so make sure we retry with original nodemask in the + * slow path. + */ goto no_zone; } @@ -3802,6 +3830,7 @@ retry_cpuset: if (likely(page)) goto out; +no_zone: /* * Runtime PM, block IO and its error handling path can deadlock * because I/O on the device might not complete. @@ -3813,21 +3842,10 @@ retry_cpuset: * Restore the original nodemask if it was potentially replaced with * &cpuset_current_mems_allowed to optimize the fast-path attempt. */ - if (cpusets_enabled()) + if (unlikely(ac.nodemask != nodemask)) ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); -no_zone: - /* - * When updating a task's mems_allowed, it is possible to race with - * parallel threads in such a way that an allocation can fail while - * the mask is being updated. If a page allocation is about to fail, - * check if the cpuset changed during allocation and if so, retry. - */ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { - alloc_mask = gfp_mask; - goto retry_cpuset; - } + page = __alloc_pages_slowpath(alloc_mask, order, &ac); out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && @@ -3904,8 +3922,8 @@ EXPORT_SYMBOL(free_pages); * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ -static struct page *__page_frag_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) { struct page *page = NULL; gfp_t gfp = gfp_mask; @@ -3925,8 +3943,23 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } -void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void __page_frag_cache_drain(struct page *page, unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) { + unsigned int order = compound_order(page); + + if (order == 0) + free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +} +EXPORT_SYMBOL(__page_frag_cache_drain); + +void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -3934,7 +3967,7 @@ void *__alloc_page_frag(struct page_frag_cache *nc, if (unlikely(!nc->va)) { refill: - page = __page_frag_refill(nc, gfp_mask); + page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; @@ -3977,19 +4010,19 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(__alloc_page_frag); +EXPORT_SYMBOL(page_frag_alloc); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ -void __free_page_frag(void *addr) +void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) __free_pages_ok(page, compound_order(page)); } -EXPORT_SYMBOL(__free_page_frag); +EXPORT_SYMBOL(page_frag_free); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) @@ -7241,6 +7274,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, + .gfp_mask = GFP_KERNEL, }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_io.c b/mm/page_io.c index a2651f58c86a..23f6d0d3470f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -320,10 +320,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = -ENOMEM; goto out; } - if (wbc->sync_mode == WB_SYNC_ALL) - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC); - else - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/percpu.c b/mm/percpu.c index f696385bcc44..0686f566d347 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -886,7 +886,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); - if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || + !is_power_of_2(align))) { WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..3a7587a0314d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -71,7 +71,7 @@ static struct vfsmount *shm_mnt; #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> #include "internal.h" @@ -415,6 +415,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { LIST_HEAD(list), *pos, *next; + LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct page *page; @@ -441,9 +442,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* Check if there's anything to gain */ if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_del_init(&info->shrinklist); + list_move(&info->shrinklist, &to_remove); removed++; - iput(inode); goto next; } @@ -454,6 +454,13 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &to_remove) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + list_del_init(&info->shrinklist); + iput(inode); + } + list_for_each_safe(pos, next, &list) { int ret; @@ -661,8 +668,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -1049,6 +1056,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. */ @@ -1062,7 +1093,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -2447,8 +2478,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2517,8 +2548,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -3188,7 +3219,6 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3196,7 +3226,6 @@ static const struct inode_operations shmem_short_symlink_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, diff --git a/mm/slab.c b/mm/slab.c index 87b29e76cafd..4f2ec6bb46eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -552,12 +552,7 @@ static void start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); - /* - * When this gets called from do_initcalls via cpucache_init(), - * init_workqueues() has already run, so keventd will be setup - * at that time. - */ - if (keventd_up() && reap_work->work.func == NULL) { + if (reap_work->work.func == NULL) { init_reap_node(cpu); INIT_DEFERRABLE_WORK(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, @@ -2462,7 +2457,6 @@ union freelist_init_state { unsigned int pos; unsigned int *list; unsigned int count; - unsigned int rand; }; struct rnd_state rnd_state; }; @@ -2488,8 +2482,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, } else { state->list = cachep->random_seq; state->count = count; - state->pos = 0; - state->rand = rand; + state->pos = rand % count; ret = true; } return ret; @@ -2498,7 +2491,9 @@ static bool freelist_state_initialize(union freelist_init_state *state, /* Get the next entry on the list and randomize it using a random shift */ static freelist_idx_t next_random_slot(union freelist_init_state *state) { - return (state->list[state->pos++] + state->rand) % state->count; + if (state->pos >= state->count) + state->pos = 0; + return state->list[state->pos++]; } /* Swap two freelist entries */ diff --git a/mm/slub.c b/mm/slub.c index 067598a00849..7ec0a965c6a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -496,10 +496,11 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -static void print_section(char *text, u8 *addr, unsigned int length) +static void print_section(char *level, char *text, u8 *addr, + unsigned int length) { metadata_access_enable(); - print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); metadata_access_disable(); } @@ -636,14 +637,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + s->red_left_pad); else if (p > addr + 16) - print_section("Bytes b4 ", p - 16, 16); + print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->object_size, - PAGE_SIZE)); + print_section(KERN_ERR, "Object ", p, + min_t(unsigned long, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); if (s->offset) @@ -658,7 +660,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, size_from_object(s) - off); + print_section(KERN_ERR, "Padding ", p + off, + size_from_object(s) - off); dump_stack(); } @@ -820,7 +823,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -973,7 +976,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object ", (void *)object, + print_section(KERN_INFO, "Object ", (void *)object, s->object_size); dump_stack(); @@ -1419,6 +1422,10 @@ static int init_cache_random_seq(struct kmem_cache *s) int err; unsigned long i, count = oo_objects(s->oo); + /* Bailout if already initialised */ + if (s->random_seq) + return 0; + err = cache_random_seq_create(s, count, GFP_KERNEL); if (err) { pr_err("SLUB: Unable to initialize free list for %s\n", diff --git a/mm/swap.c b/mm/swap.c index 4dcf852e1e6d..844baedd2429 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -69,6 +69,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(zone_lru_lock(zone), flags); } + __ClearPageWaiters(page); mem_cgroup_uncharge(page); } @@ -784,6 +785,7 @@ void release_pages(struct page **pages, int nr, bool cold) /* Clear Active bit in case of parallel mark_page_accessed */ __ClearPageActive(page); + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1c6e0321205d..4761701d1721 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -2448,6 +2462,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; diff --git a/mm/truncate.c b/mm/truncate.c index fd97f1dbce29..dd7b24e083c5 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -24,20 +24,12 @@ #include <linux/rmap.h> #include "internal.h" -static void clear_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) { struct radix_tree_node *node; void **slot; - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) - return; - - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); - return; - } spin_lock_irq(&mapping->tree_lock); /* * Regular page slots are stabilized by the page lock even @@ -55,6 +47,56 @@ unlock: spin_unlock_irq(&mapping->tree_lock); } +/* + * Unconditionally remove exceptional entry. Usually called from truncate path. + */ +static void truncate_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + if (dax_mapping(mapping)) { + dax_delete_mapping_entry(mapping, index); + return; + } + clear_shadow_entry(mapping, index, entry); +} + +/* + * Invalidate exceptional entry if easily possible. This handles exceptional + * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and + * clean entries. + */ +static int invalidate_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return 1; + if (dax_mapping(mapping)) + return dax_invalidate_mapping_entry(mapping, index); + clear_shadow_entry(mapping, index, entry); + return 1; +} + +/* + * Invalidate exceptional entry if clean. This handles exceptional entries for + * invalidate_inode_pages2() so for DAX it evicts only clean entries. + */ +static int invalidate_exceptional_entry2(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return 1; + if (dax_mapping(mapping)) + return dax_invalidate_mapping_entry_sync(mapping, index); + clear_shadow_entry(mapping, index, entry); + return 1; +} + /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected @@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + truncate_exceptional_entry(mapping, index, + page); continue; } @@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping, } if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + truncate_exceptional_entry(mapping, index, + page); continue; } @@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + invalidate_exceptional_entry(mapping, index, + page); continue; } @@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, break; if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); + if (!invalidate_exceptional_entry2(mapping, + index, page)) + ret = -EBUSY; continue; } diff --git a/mm/util.c b/mm/util.c index 1a41553db866..3cb2164f4099 100644 --- a/mm/util.c +++ b/mm/util.c @@ -13,7 +13,7 @@ #include <linux/vmalloc.h> #include <asm/sections.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a5584384eabc..3ca82d44edd3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -32,7 +32,7 @@ #include <linux/llist.h> #include <linux/bitops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> diff --git a/mm/vmscan.c b/mm/vmscan.c index 6aa5b01d3e75..532a2a750952 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); +} + /* * Add a shrinker callback to be called from the vm. */ @@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); - } - #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); #endif + } + } /* @@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!managed_zone(zone)) continue; - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); inactive -= min(inactive, inactive_zone); active -= min(active, active_zone); diff --git a/mm/workingset.c b/mm/workingset.c index 241fa5d6b3b2..abb58ffa3c64 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -473,7 +473,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node); + __radix_tree_delete_node(&mapping->page_tree, node, + workingset_update_node, mapping); out_invalid: spin_unlock(&mapping->tree_lock); diff --git a/mm/zswap.c b/mm/zswap.c index 067a0d62f318..cabf09e0128b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -78,7 +78,13 @@ static u64 zswap_duplicate_entry; /* Enable/disable zswap (disabled by default) */ static bool zswap_enabled; -module_param_named(enabled, zswap_enabled, bool, 0644); +static int zswap_enabled_param_set(const char *, + const struct kernel_param *); +static struct kernel_param_ops zswap_enabled_param_ops = { + .set = zswap_enabled_param_set, + .get = param_get_bool, +}; +module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" @@ -176,6 +182,9 @@ static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; +/* fatal error during init */ +static bool zswap_init_failed; + /********************************* * helpers and fwd declarations **********************************/ @@ -624,6 +633,11 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *s = strstrip((char *)val); int ret; + if (zswap_init_failed) { + pr_err("can't set param, initialization failed\n"); + return -ENODEV; + } + /* no change required */ if (!strcmp(s, *(char **)kp->arg)) return 0; @@ -703,6 +717,17 @@ static int zswap_zpool_param_set(const char *val, return __zswap_param_set(val, kp, NULL, zswap_compressor); } +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + if (zswap_init_failed) { + pr_err("can't enable, initialization failed\n"); + return -ENODEV; + } + + return param_set_bool(val, kp); +} + /********************************* * writeback code **********************************/ @@ -1201,6 +1226,9 @@ hp_fail: dstmem_fail: zswap_entry_cache_destroy(); cache_fail: + /* if built-in, we aren't unloaded on failure; don't allow use */ + zswap_init_failed = true; + zswap_enabled = false; return -ENOMEM; } /* must be late so crypto has time to come up */ |