From 8144458caee8c870696693d70a77893d44d61005 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Feb 2017 10:25:39 +1100 Subject: mm: avoid returning VM_FAULT_RETRY from ->page_mkwrite handlers Some ->page_mkwrite handlers may return VM_FAULT_RETRY as its return code (GFS2 or Lustre can definitely do this). However VM_FAULT_RETRY from ->page_mkwrite is completely unhandled by the mm code and results in locking and writeably mapping the page which definitely is not what the caller wanted. Fix Lustre and block_page_mkwrite_ret() used by other filesystems (notably GFS2) to return VM_FAULT_NOPAGE instead which results in bailing out from the fault code, the CPU then retries the access, and we fault again effectively doing what the handler wanted. Link: http://lkml.kernel.org/r/20170203150729.15863-1-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Al Viro Reviewed-by: Jinshan Xiong Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index d67ab83823ad..79591c3660cc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -243,12 +243,10 @@ static inline int block_page_mkwrite_return(int err) { if (err == 0) return VM_FAULT_LOCKED; - if (err == -EFAULT) + if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; - if (err == -EAGAIN) - return VM_FAULT_RETRY; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } -- cgit v1.2.3 From 2176488c57d760745d3c6e285b88ecd83acc453b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:25:39 +1100 Subject: cpumask: use nr_cpumask_bits for parsing functions 513e3d2d11c9 ("cpumask: always use nr_cpu_ids in formatting and parsing functions") converted both cpumask printing and parsing functions to use nr_cpu_ids instead of nr_cpumask_bits. While this was okay for the printing functions as it just picked one of the two output formats that we were alternating between depending on a kernel config, doing the same for parsing wasn't okay. nr_cpumask_bits can be either nr_cpu_ids or NR_CPUS. We can always use nr_cpu_ids but that is a variable while NR_CPUS is a constant, so it can be more efficient to use NR_CPUS when we can get away with it. Converting the printing functions to nr_cpu_ids makes sense because it affects how the masks get presented to userspace and doesn't break anything; however, using nr_cpu_ids for parsing functions can incorrectly leave the higher bits uninitialized while reading in these masks from userland. As all testing and comparison functions use nr_cpumask_bits which can be larger than nr_cpu_ids, the parsed cpumasks can erroneously yield false negative results. This made the taskstats interface incorrectly return -EINVAL even when the inputs were correct. Fix it by restoring the parse functions to use nr_cpumask_bits instead of nr_cpu_ids. 
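To make the failure mode concrete, here is a minimal hedged sketch of a caller in the style of the taskstats path; the function name is invented and the mask handling simplified, but it shows how a mask parsed over only nr_cpu_ids bits can then fail a check that scans nr_cpumask_bits bits:

/* Illustrative only: not the actual taskstats code. */
static int example_parse_cpumask(const char __user *buf, int len)
{
        cpumask_var_t mask;
        int ret;

        /* alloc_cpumask_var() does not zero the allocation. */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        /*
         * With the buggy nr_cpu_ids limit, only the low nr_cpu_ids bits are
         * written here; bits [nr_cpu_ids, nr_cpumask_bits) stay stale.
         */
        ret = cpumask_parse_user(buf, len, mask);
        if (ret)
                goto out;

        /*
         * Tests like this scan nr_cpumask_bits bits, so a stale high bit
         * makes a perfectly valid input look bogus.
         */
        if (!cpumask_subset(mask, cpu_possible_mask))
                ret = -EINVAL;
out:
        free_cpumask_var(mask);
        return ret;
}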
Link: http://lkml.kernel.org/r/20170206182442.GB31078@htj.duckdns.org Fixes: 513e3d2d11c9 ("cpumask: always use nr_cpu_ids in formatting and parsing functions") Signed-off-by: Tejun Heo Reported-by: Martin Steigerwald Debugged-by: Ben Hutchings Cc: [4.0+] Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c717f5ea88cb..b3d2c1a89ac4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -560,7 +560,7 @@ static inline void cpumask_copy(struct cpumask *dstp, static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -575,7 +575,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), - nr_cpu_ids); + nr_cpumask_bits); } /** @@ -590,7 +590,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) char *nl = strchr(buf, '\n'); unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -602,7 +602,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** -- cgit v1.2.3 From 7768b579d4408893e72fdbb07c83c314f1fb3646 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 8 Feb 2017 10:25:42 +1100 Subject: tracing: add __print_flags_u64() Patch series "DAX tracepoints, mm argument simplification", v4. This contains both my DAX tracepoint code and Dave Jiang's MM argument simplifications. Dave's code was written with my tracepoint code as a baseline, so it seemed simplest to keep them together in a single series. This patch (of 7): Add __print_flags_u64() and the helper trace_print_flags_seq_u64() in the same spirit as __print_symbolic_u64() and trace_print_symbols_seq_u64(). These functions allow us to print symbols associated with flags that are 64 bits wide even on 32 bit machines. These will be used by the DAX code so that we can print the flags set in a pfn_t such as PFN_SG_CHAIN, PFN_SG_LAST, PFN_DEV and PFN_MAP. 
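As a hedged illustration of how the new macro is meant to be used from a TP_printk() format (the event, field, and flag names below are invented for this sketch; the real consumer is the pfn_t tracing added later in this series):

/* Illustrative only: a made-up event demonstrating __print_flags_u64(). */
TRACE_EVENT(example_u64_flags,
        TP_PROTO(u64 flags),
        TP_ARGS(flags),
        TP_STRUCT__entry(
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->flags = flags;
        ),
        /* Prints e.g. "flags HIGH|LOW" even on a 32-bit kernel. */
        TP_printk("flags %s",
                __print_flags_u64(__entry->flags, "|",
                        { (1ULL << 63), "HIGH" },
                        { (1ULL << 0), "LOW" }))
);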
Without this new function I was getting errors like the following when compiling for i386: ./include/linux/pfn_t.h:13:22: warning: large integer implicitly truncated to unsigned type [-Woverflow] #define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1)) ^ Link: http://lkml.kernel.org/r/1484085142-2297-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/trace_events.h | 4 ++++ include/trace/trace_events.h | 11 +++++++++++ kernel/trace/trace_output.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index be007610ceb0..db2c3ba44500 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 +const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array); + const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 467e12f780d8..c6e9f7231f82 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq(p, value, symbols); \ }) +#undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array, { -1, NULL } }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags); \ + }) + #define __print_symbolic_u64(value, symbol_array...) \ ({ \ static const struct trace_print_flags_u64 symbols[] = \ @@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_symbols_seq_u64(p, value, symbols); \ }) #else +#define __print_flags_u64(flag, delim, flag_array...) \ + __print_flags(flag, delim, flag_array) + #define __print_symbolic_u64(value, symbol_array...) 
\ __print_symbolic(value, symbol_array) #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..5204485dbe17 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -123,6 +123,44 @@ trace_print_symbols_seq(struct trace_seq *p, unsigned long val, EXPORT_SYMBOL(trace_print_symbols_seq); #if BITS_PER_LONG == 32 +const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array) -- cgit v1.2.3 From f03743b00308f6e081cc2991a886442a3f716580 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 8 Feb 2017 10:25:42 +1100 Subject: dax: add tracepoint infrastructure, PMD tracing Tracepoints are the standard way to capture debugging and tracing information in many parts of the kernel, including the XFS and ext4 filesystems. Create a tracepoint header for FS DAX and add the first DAX tracepoints to the PMD fault handler. This allows the tracing for DAX to be done in the same way as the filesystem tracing so that developers can look at them together and get a coherent idea of what the system is doing. I added both an entry and exit tracepoint because future patches will add tracepoints to child functions of dax_iomap_pmd_fault() like dax_pmd_load_hole() and dax_pmd_insert_mapping(). We want those messages to be wrapped by the parent function tracepoints so the code flow is more easily understood. Having entry and exit tracepoints for faults also allows us to easily see what filesystems functions were called during the fault. These filesystem functions get executed via iomap_begin() and iomap_end() calls, for example, and will have their own tracepoints. For PMD faults we primarily want to understand the type of mapping, the fault flags, the faulting address and whether it fell back to 4k faults. If it fell back to 4k faults the tracepoints should let us understand why. I named the new tracepoint header file "fs_dax.h" to allow for device DAX to have its own separate tracing header in the same directory at some point. Here is an example output for these events from a successful PMD fault: big-1441 [005] .... 32.582758: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 big-1441 [005] .... 32.582776: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 big-1441 [005] .... 
32.583292: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Chinner Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/dax.c | 30 ++++++++++++------- include/linux/mm.h | 25 ++++++++++++++++ include/trace/events/fs_dax.h | 68 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 include/trace/events/fs_dax.h (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index c45598b912e1..93b99800fc56 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -35,6 +35,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -1337,6 +1340,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, loff_t pos; int error; + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0); + /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) goto fallback; @@ -1347,16 +1360,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - /* - * Check whether offset isn't beyond end of file now. Caller is - * supposed to hold locks serializing us with truncate / punch hole so - * this is a reliable test. - */ - pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - - if (pgoff > max_pgoff) - return VM_FAULT_SIGBUS; + if (pgoff > max_pgoff) { + result = VM_FAULT_SIGBUS; + goto out; + } /* If the PMD would extend beyond the file size */ if ((pgoff | PG_PMD_COLOUR) > max_pgoff) @@ -1428,6 +1435,9 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, split_huge_pmd(vma, pmd, address); count_vm_event(THP_FAULT_FALLBACK); } +out: + trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, + result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/include/linux/mm.h b/include/linux/mm.h index b84615b0f64c..05b56ebb1bfd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,6 +281,17 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_TRACE \ + { FAULT_FLAG_WRITE, "WRITE" }, \ + { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ + { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ + { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ + { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ + { FAULT_FLAG_TRIED, "TRIED" }, \ + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" } + /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. 
The vma's ->fault is responsible for returning a bitmask @@ -1107,6 +1118,20 @@ static inline void clear_page_pfmemalloc(struct page *page) VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_FALLBACK) +#define VM_FAULT_RESULT_TRACE \ + { VM_FAULT_OOM, "OOM" }, \ + { VM_FAULT_SIGBUS, "SIGBUS" }, \ + { VM_FAULT_MAJOR, "MAJOR" }, \ + { VM_FAULT_WRITE, "WRITE" }, \ + { VM_FAULT_HWPOISON, "HWPOISON" }, \ + { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ + { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ + { VM_FAULT_NOPAGE, "NOPAGE" }, \ + { VM_FAULT_LOCKED, "LOCKED" }, \ + { VM_FAULT_RETRY, "RETRY" }, \ + { VM_FAULT_FALLBACK, "FALLBACK" }, \ + { VM_FAULT_DONE_COW, "DONE_COW" } + /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h new file mode 100644 index 000000000000..58b0b5612683 --- /dev/null +++ b/include/trace/events/fs_dax.h @@ -0,0 +1,68 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs_dax + +#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FS_DAX_H + +#include + +DECLARE_EVENT_CLASS(dax_pmd_fault_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, unsigned int flags, pgoff_t pgoff, + pgoff_t max_pgoff, int result), + TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(pgoff_t, max_pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->flags = flags; + __entry->pgoff = pgoff; + __entry->max_pgoff = max_pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start " + "%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->vm_start, + __entry->vm_end, + __entry->pgoff, + __entry->max_pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PMD_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pmd_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, unsigned int flags, pgoff_t pgoff, \ + pgoff_t max_pgoff, int result), \ + TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) + +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); +DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); + + +#endif /* _TRACE_FS_DAX_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From b9b929fcfaf6add30172a2d06298ed3e3be40441 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 8 Feb 2017 10:25:43 +1100 Subject: dax: add tracepoints to dax_pmd_load_hole() Add tracepoints to dax_pmd_load_hole(), following the same logging conventions as the tracepoints in dax_iomap_pmd_fault(). Here is an example PMD fault showing the new tracepoints: read_big-1478 [004] .... 238.242188: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 read_big-1478 [004] .... 
238.242191: dax_pmd_fault: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200 max_pgoff 0x1400 read_big-1478 [004] .... 238.242390: dax_pmd_load_hole: dev 259:0 ino 0x1003 shared address 0x10400000 zero_page ffffea0002c20000 radix_entry 0x1e read_big-1478 [004] .... 238.242392: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared ALLOW_RETRY|KILLABLE|USER address 0x10400000 vm_start 0x10200000 vm_end 0x10600000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-5-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/dax.c | 14 ++++++++++---- include/trace/events/fs_dax.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 93b99800fc56..c7bea19803f2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1295,33 +1295,39 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, { struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = address & PMD_MASK; + struct inode *inode = mapping->host; struct page *zero_page; + void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - void *ret; zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) - return VM_FAULT_FALLBACK; + goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, RADIX_DAX_PMD | RADIX_DAX_HZP); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); - return VM_FAULT_FALLBACK; + goto fallback; } pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret); return VM_FAULT_NOPAGE; + +fallback: + trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret); + return VM_FAULT_FALLBACK; } int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 58b0b5612683..43f1263131a4 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -61,6 +61,48 @@ DEFINE_EVENT(dax_pmd_fault_class, name, \ DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); +DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, struct page *zero_page, + void *radix_entry), + TP_ARGS(inode, vma, address, zero_page, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(struct page *, zero_page) + __field(void *, radix_entry) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->zero_page = zero_page; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p " + "radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? 
"shared" : "private", + __entry->address, + __entry->zero_page, + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \ +DEFINE_EVENT(dax_pmd_load_hole_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, struct page *zero_page, \ + void *radix_entry), \ + TP_ARGS(inode, vma, address, zero_page, radix_entry)) + +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); +DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); #endif /* _TRACE_FS_DAX_H */ -- cgit v1.2.3 From 5770b2ed95692e82bd7e1b5f73d95781c3ed463b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 8 Feb 2017 10:25:43 +1100 Subject: dax: add tracepoints to dax_pmd_insert_mapping() Add tracepoints to dax_pmd_insert_mapping(), following the same logging conventions as the tracepoints in dax_iomap_pmd_fault(). Here is an example PMD fault showing the new tracepoints: big-1504 [001] .... 326.960743: xfs_filemap_pmd_fault: dev 259:0 ino 0x1003 big-1504 [001] .... 326.960753: dax_pmd_fault: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 big-1504 [001] .... 326.960981: dax_pmd_insert_mapping: dev 259:0 ino 0x1003 shared write address 0x10505000 length 0x200000 pfn 0x100600 DEV|MAP radix_entry 0xc000e big-1504 [001] .... 326.960986: dax_pmd_fault_done: dev 259:0 ino 0x1003 shared WRITE|ALLOW_RETRY|KILLABLE|USER address 0x10505000 vm_start 0x10200000 vm_end 0x10700000 pgoff 0x200 max_pgoff 0x1400 NOPAGE Link: http://lkml.kernel.org/r/1484085142-2297-6-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/dax.c | 12 +++++++--- include/linux/pfn_t.h | 6 +++++ include/trace/events/fs_dax.h | 51 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index c7bea19803f2..88e46eb2ff4a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1258,15 +1258,16 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, { struct address_space *mapping = vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; + struct inode *inode = mapping->host; struct blk_dax_ctl dax = { .sector = dax_iomap_sector(iomap, pos), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); - void *ret; + void *ret = NULL; if (length < 0) /* dax_map_atomic() failed */ - return VM_FAULT_FALLBACK; + goto fallback; if (length < PMD_SIZE) goto unmap_fallback; if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) @@ -1279,13 +1280,18 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, RADIX_DAX_PMD); if (IS_ERR(ret)) - return VM_FAULT_FALLBACK; + goto fallback; *entryp = ret; + trace_dax_pmd_insert_mapping(inode, vma, address, write, length, + dax.pfn, ret); return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); unmap_fallback: dax_unmap_atomic(bdev, &dax); +fallback: + trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write, + length, dax.pfn, ret); return VM_FAULT_FALLBACK; } diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a3d90b9da18d..033fc7bbcefa 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -15,6 +15,12 @@ #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 
4)) +#define PFN_FLAGS_TRACE \ + { PFN_SG_CHAIN, "SG_CHAIN" }, \ + { PFN_SG_LAST, "SG_LAST" }, \ + { PFN_DEV, "DEV" }, \ + { PFN_MAP, "MAP" } + static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags) { pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 43f1263131a4..c3b0aae216dc 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -104,6 +104,57 @@ DEFINE_EVENT(dax_pmd_load_hole_class, name, \ DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); +DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, + unsigned long address, int write, long length, pfn_t pfn, + void *radix_entry), + TP_ARGS(inode, vma, address, write, length, pfn, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(long, length) + __field(u64, pfn_val) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vma->vm_flags; + __entry->address = address; + __entry->write = write; + __entry->length = length; + __entry->pfn_val = pfn.val; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx " + "pfn %#llx %s radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? "write" : "read", + __entry->address, + __entry->length, + __entry->pfn_val & ~PFN_FLAGS_MASK, + __print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|", + PFN_FLAGS_TRACE), + (unsigned long)__entry->radix_entry + ) +) + +#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ +DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ + TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ + unsigned long address, int write, long length, pfn_t pfn, \ + void *radix_entry), \ + TP_ARGS(inode, vma, address, write, length, pfn, radix_entry)) + +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); +DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ -- cgit v1.2.3 From d090a0cea1969f1ed337b640657a69e51a68be23 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:25:44 +1100 Subject: mm, dax: make pmd_fault() and friends be the same as fault() Instead of passing in multiple parameters in the pmd_fault() handler, a vmf can be passed in just like a fault() handler. This will simplify code and remove the need for the actual pmd fault handlers to allocate a vmf. Related functions are also modified to do the same. 
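A hedged before/after sketch of what this means for a filesystem's ->pmd_fault handler; the handler name and example_iomap_ops are placeholders, not code from any of the converted filesystems:

/* Before: address, pmd and flags were passed alongside the vma. */
static int example_pmd_fault_old(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd, unsigned int flags)
{
        return dax_iomap_pmd_fault(vma, addr, pmd, flags, &example_iomap_ops);
}

/* After this patch: the same information travels inside struct vm_fault. */
static int example_pmd_fault_new(struct vm_area_struct *vma,
                struct vm_fault *vmf)
{
        /* vmf->address, vmf->pmd and vmf->flags replace the loose arguments. */
        return dax_iomap_pmd_fault(vma, vmf, &example_iomap_ops);
}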
Link: http://lkml.kernel.org/r/1484085142-2297-7-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Dave Jiang Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- drivers/dax/dax.c | 16 +++++++--------- fs/dax.c | 42 ++++++++++++++++++------------------------ fs/ext4/file.c | 9 ++++----- fs/xfs/xfs_file.c | 10 ++++------ include/linux/dax.h | 7 +++---- include/linux/mm.h | 3 +-- include/trace/events/fs_dax.h | 15 +++++++-------- mm/memory.c | 6 ++---- 8 files changed, 46 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index ed758b74ddf0..a8833cc35697 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -473,10 +473,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, - unsigned int flags) + struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long pmd_addr = addr & PMD_MASK; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; @@ -508,23 +507,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, - flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (flags & FAULT_FLAG_WRITE) + current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rc = __dax_dev_pmd_fault(dax_dev, vma, vmf); rcu_read_unlock(); return rc; diff --git a/fs/dax.c b/fs/dax.c index 88e46eb2ff4a..b99362c46984 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1336,18 +1336,17 @@ fallback: return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; - pgoff_t max_pgoff, pgoff; - struct vm_fault vmf; + pgoff_t max_pgoff; void *entry; loff_t pos; int error; @@ -1357,10 +1356,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. 
*/ - pgoff = linear_page_index(vma, pmd_addr); + vmf->pgoff = linear_page_index(vma, pmd_addr); max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0); + trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0); /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) @@ -1372,13 +1371,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (vmf->pgoff > max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((vmf->pgoff | PG_PMD_COLOUR) > max_pgoff) goto fallback; /* @@ -1386,7 +1385,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. */ - pos = (loff_t)pgoff << PAGE_SHIFT; + pos = (loff_t)vmf->pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) goto fallback; @@ -1400,25 +1399,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the tree, for instance), it will return -EEXIST and we just fall * back to 4k entries. */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + entry = grab_mapping_entry(mapping, vmf->pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) goto finish_iomap; - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; - switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, - &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, + vmf->address, &iomap, pos, write, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, - &entry); + result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, + &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1426,7 +1421,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1444,12 +1439,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, } fallback: if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, pmd, address); + split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vma, address, flags, pgoff, max_pgoff, - result); + trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d663d3d7c81c..10b64ba29cd2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -275,21 +275,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return result; } -static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) +static int +ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = flags & FAULT_FLAG_WRITE; + bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); 
file_update_time(vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vma, addr, pmd, flags, - &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index bbb9eb6811b2..44a8d2356e31 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1432,9 +1432,7 @@ xfs_filemap_fault( STATIC int xfs_filemap_pmd_fault( struct vm_area_struct *vma, - unsigned long addr, - pmd_t *pmd, - unsigned int flags) + struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1445,16 +1443,16 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - if (flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (flags & FAULT_FLAG_WRITE) + if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 24ad71173995..a829fee2b42b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,16 +71,15 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct iomap_ops *ops) + struct vm_fault *vmf, struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 05b56ebb1bfd..0e46928b66b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -347,8 +347,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *, unsigned long address, - pmd_t *, unsigned int flags); + int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c3b0aae216dc..a98665bfb38f 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -8,9 +8,8 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, unsigned int flags, pgoff_t pgoff, - pgoff_t max_pgoff, int result), - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result), + struct vm_fault *vmf, pgoff_t max_pgoff, int result), + TP_ARGS(inode, vma, vmf, max_pgoff, result), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_start) @@ -29,9 +28,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->vm_flags = vma->vm_flags; - __entry->address = address; - __entry->flags = flags; - __entry->pgoff = pgoff; + __entry->address = vmf->address; 
+ __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; __entry->max_pgoff = max_pgoff; __entry->result = result; ), @@ -54,9 +53,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, #define DEFINE_PMD_FAULT_EVENT(name) \ DEFINE_EVENT(dax_pmd_fault_class, name, \ TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, unsigned int flags, pgoff_t pgoff, \ + struct vm_fault *vmf, \ pgoff_t max_pgoff, int result), \ - TP_ARGS(inode, vma, address, flags, pgoff, max_pgoff, result)) + TP_ARGS(inode, vma, vmf, max_pgoff, result)) DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); diff --git a/mm/memory.c b/mm/memory.c index 6bf2b471e30c..2376f8528800 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3475,8 +3475,7 @@ static int create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, - vmf->flags); + return vma->vm_ops->pmd_fault(vma, vmf); return VM_FAULT_FALLBACK; } @@ -3485,8 +3484,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, - vmf->pmd, vmf->flags); + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); -- cgit v1.2.3 From 8e1deac21d8494e7f076abeea5cff6c1b84f9ff4 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:25:45 +1100 Subject: mm, dax: change pmd_fault() to take only vmf parameter pmd_fault() and related functions really only need the vmf parameter since the additional parameters are all included in the vmf struct. Remove the additional parameter and simplify pmd_fault() and friends. 
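A hedged sketch of the handler shape after this change (again with placeholder names); the vma is now reached through vmf->vma rather than being passed separately:

/* Illustrative only: the whole fault description now arrives in one struct. */
static int example_pmd_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vmf->flags & FAULT_FLAG_WRITE)
                file_update_time(vma->vm_file);

        return dax_iomap_pmd_fault(vmf, &example_iomap_ops);
}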
Link: http://lkml.kernel.org/r/1484085142-2297-8-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Dave Jiang Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Dave Jiang Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton --- drivers/dax/dax.c | 18 +++++++-------- fs/dax.c | 54 ++++++++++++++++++++----------------------- fs/ext4/file.c | 8 +++---- fs/xfs/xfs_file.c | 7 +++--- include/linux/dax.h | 7 +++--- include/linux/mm.h | 2 +- include/trace/events/fs_dax.h | 54 ++++++++++++++++++++----------------------- mm/memory.c | 9 ++++---- 8 files changed, 74 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index a8833cc35697..18e9875f6277 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -472,8 +472,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return rc; } -static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, - struct vm_area_struct *vma, struct vm_fault *vmf) +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; struct device *dev = &dax_dev->dev; @@ -482,7 +481,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pgoff_t pgoff; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -497,7 +496,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, return VM_FAULT_SIGBUS; } - pgoff = linear_page_index(vma, pmd_addr); + pgoff = linear_page_index(vmf->vma, pmd_addr); phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, @@ -507,22 +506,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int dax_dev_pmd_fault(struct vm_fault *vmf) { int rc; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct dax_dev *dax_dev = filp->private_data; dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", vma->vm_start, vma->vm_end); + ? 
"write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vma, vmf); + rc = __dax_dev_pmd_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/fs/dax.c b/fs/dax.c index 18192dbebc10..26c2dc3aede0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1252,11 +1252,10 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) -static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, loff_t pos, bool write, void **entryp) +static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos, void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; struct blk_dax_ctl dax = { @@ -1283,31 +1282,30 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, goto fallback; *entryp = ret; - trace_dax_pmd_insert_mapping(inode, vma, address, write, length, - dax.pfn, ret); - return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + dax.pfn, vmf->flags & FAULT_FLAG_WRITE); unmap_fallback: dax_unmap_atomic(bdev, &dax); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vma, address, write, - length, dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, + dax.pfn, ret); return VM_FAULT_FALLBACK; } -static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, - struct vm_fault *vmf, unsigned long address, - struct iomap *iomap, void **entryp) +static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, + void **entryp) { - struct address_space *mapping = vma->vm_file->f_mapping; - unsigned long pmd_addr = address & PMD_MASK; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; - zero_page = mm_get_huge_zero_page(vma->vm_mm); + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); if (unlikely(!zero_page)) goto fallback; @@ -1318,27 +1316,27 @@ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, goto fallback; *entryp = ret; - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { spin_unlock(ptl); goto fallback; } - pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vma, address, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); return VM_FAULT_NOPAGE; fallback: - trace_dax_pmd_load_hole_fallback(inode, vma, address, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = 
vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -1359,7 +1357,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, pgoff = linear_page_index(vma, pmd_addr); max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; - trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0); + trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) @@ -1405,15 +1403,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vma, vmf->pmd, vmf, - vmf->address, &iomap, pos, write, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) goto unlock_entry; - result = dax_pmd_load_hole(vma, vmf->pmd, vmf, vmf->address, - &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1443,7 +1439,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vma, vmf, max_pgoff, result); + trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 10b64ba29cd2..75dc3dd531d7 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -276,19 +276,19 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } static int -ext4_dax_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +ext4_dax_pmd_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vma, vmf, &ext4_iomap_ops); + result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 44a8d2356e31..9d8440b07b53 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1431,10 +1431,9 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_pmd_fault( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; @@ -1445,11 +1444,11 @@ xfs_filemap_pmd_fault( if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index a829fee2b42b..c1bd6ab5e974 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -71,15 +71,14 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); +int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void 
*entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, - struct vm_fault *vmf, struct iomap_ops *ops) +static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, + struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e46928b66b4..de045a48137c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -347,7 +347,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); - int (*pmd_fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index a98665bfb38f..c566ddc87f73 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -7,9 +7,9 @@ #include DECLARE_EVENT_CLASS(dax_pmd_fault_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - struct vm_fault *vmf, pgoff_t max_pgoff, int result), - TP_ARGS(inode, vma, vmf, max_pgoff, result), + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + pgoff_t max_pgoff, int result), + TP_ARGS(inode, vmf, max_pgoff, result), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_start) @@ -25,9 +25,9 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_start = vma->vm_start; - __entry->vm_end = vma->vm_end; - __entry->vm_flags = vma->vm_flags; + __entry->vm_start = vmf->vma->vm_start; + __entry->vm_end = vmf->vma->vm_end; + __entry->vm_flags = vmf->vma->vm_flags; __entry->address = vmf->address; __entry->flags = vmf->flags; __entry->pgoff = vmf->pgoff; @@ -52,19 +52,18 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class, #define DEFINE_PMD_FAULT_EVENT(name) \ DEFINE_EVENT(dax_pmd_fault_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - struct vm_fault *vmf, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ pgoff_t max_pgoff, int result), \ - TP_ARGS(inode, vma, vmf, max_pgoff, result)) + TP_ARGS(inode, vmf, max_pgoff, result)) DEFINE_PMD_FAULT_EVENT(dax_pmd_fault); DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done); DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, struct page *zero_page, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + struct page *zero_page, void *radix_entry), - TP_ARGS(inode, vma, address, zero_page, radix_entry), + TP_ARGS(inode, vmf, zero_page, radix_entry), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_flags) @@ -76,8 +75,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_flags = vma->vm_flags; - __entry->address = address; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; __entry->zero_page = zero_page; __entry->radix_entry = radix_entry; ), @@ -95,19 +94,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class, #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \ DEFINE_EVENT(dax_pmd_load_hole_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, struct page *zero_page, \ - void *radix_entry), \ - TP_ARGS(inode, vma, address, zero_page, radix_entry)) + TP_PROTO(struct inode *inode, struct 
vm_fault *vmf, \ + struct page *zero_page, void *radix_entry), \ + TP_ARGS(inode, vmf, zero_page, radix_entry)) DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole); DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback); DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, - unsigned long address, int write, long length, pfn_t pfn, - void *radix_entry), - TP_ARGS(inode, vma, address, write, length, pfn, radix_entry), + TP_PROTO(struct inode *inode, struct vm_fault *vmf, + long length, pfn_t pfn, void *radix_entry), + TP_ARGS(inode, vmf, length, pfn, radix_entry), TP_STRUCT__entry( __field(unsigned long, ino) __field(unsigned long, vm_flags) @@ -121,9 +118,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->vm_flags = vma->vm_flags; - __entry->address = address; - __entry->write = write; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; __entry->length = length; __entry->pfn_val = pfn.val; __entry->radix_entry = radix_entry; @@ -146,10 +143,9 @@ DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class, #define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ - TP_PROTO(struct inode *inode, struct vm_area_struct *vma, \ - unsigned long address, int write, long length, pfn_t pfn, \ - void *radix_entry), \ - TP_ARGS(inode, vma, address, write, length, pfn, radix_entry)) + TP_PROTO(struct inode *inode, struct vm_fault *vmf, \ + long length, pfn_t pfn, void *radix_entry), \ + TP_ARGS(inode, vmf, length, pfn, radix_entry)) DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); diff --git a/mm/memory.c b/mm/memory.c index 2376f8528800..ececdc4a2892 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3471,11 +3471,10 @@ out: static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - if (vma_is_anonymous(vma)) + if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, vmf); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3484,7 +3483,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf); + return vmf->vma->vm_ops->pmd_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); -- cgit v1.2.3 From 0180ba75f59c3770bb68abf230652fd4ba6f9421 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:25:57 +1100 Subject: slub: separate out sysfs_slab_release() from sysfs_slab_remove() Separate out slub sysfs removal and release, and call the former earlier from __kmem_cache_shutdown(). There's no reason to defer sysfs removal through RCU and this will later allow us to remove sysfs files way earlier during memory cgroup offline instead of release. 
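A hedged sketch of the split this creates (heavily simplified; the real functions also check slab_state and handle memcg attribute propagation):

/* Called from __kmem_cache_shutdown(): the sysfs directory goes away now. */
static void example_sysfs_slab_remove(struct kmem_cache *s)
{
        kobject_uevent(&s->kobj, KOBJ_REMOVE);
        kobject_del(&s->kobj);
}

/* Called when the cache is finally released: drop the last reference. */
void example_sysfs_slab_release(struct kmem_cache *s)
{
        kobject_put(&s->kobj);  /* runs the kobject's ->release callback */
}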
Link: http://lkml.kernel.org/r/20170117235411.9408-3-tj@kernel.org Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/slub_def.h | 4 ++-- mm/slab_common.c | 2 +- mm/slub.c | 12 ++++++++++-- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 75f56c2ef2d4..07ef550c6627 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -113,9 +113,9 @@ struct kmem_cache { #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS -void sysfs_slab_remove(struct kmem_cache *); +void sysfs_slab_release(struct kmem_cache *); #else -static inline void sysfs_slab_remove(struct kmem_cache *s) +static inline void sysfs_slab_release(struct kmem_cache *s) { } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 3ff312dfef31..fc13a619b538 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -483,7 +483,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) list_for_each_entry_safe(s, s2, release, list) { #ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); + sysfs_slab_release(s); #else slab_kmem_cache_release(s); #endif diff --git a/mm/slub.c b/mm/slub.c index 6de08005d9cd..caac5456f0ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -214,11 +214,13 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); +static void sysfs_slab_remove(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +static inline void sysfs_slab_remove(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) @@ -3687,6 +3689,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } + sysfs_slab_remove(s); return 0; } @@ -5637,7 +5640,7 @@ out_del_kobj: goto out; } -void sysfs_slab_remove(struct kmem_cache *s) +static void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5651,7 +5654,12 @@ void sysfs_slab_remove(struct kmem_cache *s) #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); - kobject_put(&s->kobj); +} + +void sysfs_slab_release(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_put(&s->kobj); } /* -- cgit v1.2.3 From 7d348a9f480aec00cafee539592dd6ab0dbe1028 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:25:58 +1100 Subject: slab: reorganize memcg_cache_params We're going to change how memcg caches are iterated. In preparation, clean up and reorganize memcg_cache_params. * The shared ->list is replaced by ->children in root and ->children_node in children. * ->is_root_cache is removed. Instead ->root_cache is moved out of the child union and now used by both root and children. NULL indicates root cache. Non-NULL a memcg one. This patch doesn't cause any observable behavior changes. 
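A hedged sketch of how callers distinguish and walk caches after this reorganization; it mirrors the is_root_cache() and for_each_memcg_cache() helpers changed in mm/slab.h below, while the walker function itself is invented:

/* NULL root_cache now means "this is a root cache". */
static bool example_is_root_cache(struct kmem_cache *s)
{
        return !s->memcg_params.root_cache;
}

/* Walk every per-memcg child of a root cache (caller holds slab_mutex). */
static void example_walk_children(struct kmem_cache *root)
{
        struct kmem_cache *c;

        list_for_each_entry(c, &root->memcg_params.children,
                            memcg_params.children_node)
                pr_info("child of %s: %s\n", root->name, c->name);
}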
Link: http://lkml.kernel.org/r/20170117235411.9408-5-tj@kernel.org Signed-off-by: Tejun Heo Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/slab.h | 33 ++++++++++++++++++++++++--------- mm/slab.h | 6 +++--- mm/slab_common.c | 25 +++++++++++++------------ 3 files changed, 40 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4c5363566815..1f611ba00f1d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -545,22 +545,37 @@ struct memcg_cache_array { * array to be accessed without taking any locks, on relocation we free the old * version only after a grace period. * - * Child caches will hold extra metadata needed for its operation. Fields are: + * Root and child caches hold different metadata. * - * @memcg: pointer to the memcg this cache belongs to - * @root_cache: pointer to the global, root cache, this cache was derived from + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. * - * Both root and child caches of the same kind are linked into a list chained - * through @list. + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child cachces during allocation and cleared + * early during shutdown. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. */ struct memcg_cache_params { - bool is_root_cache; - struct list_head list; + struct kmem_cache *root_cache; union { - struct memcg_cache_array __rcu *memcg_caches; + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head children; + }; struct { struct mem_cgroup *memcg; - struct kmem_cache *root_cache; + struct list_head children_node; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index 2284a92dbfa6..d50f919c2e8c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -201,12 +201,12 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); * slab_mutex. 
*/ #define for_each_memcg_cache(iter, root) \ - list_for_each_entry(iter, &(root)->memcg_params.list, \ - memcg_params.list) + list_for_each_entry(iter, &(root)->memcg_params.children, \ + memcg_params.children_node) static inline bool is_root_cache(struct kmem_cache *s) { - return s->memcg_params.is_root_cache; + return !s->memcg_params.root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, diff --git a/mm/slab_common.c b/mm/slab_common.c index 2a6e060c10ed..40192d153cdc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -140,9 +140,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { - s->memcg_params.is_root_cache = true; - INIT_LIST_HEAD(&s->memcg_params.list); + s->memcg_params.root_cache = NULL; RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); + INIT_LIST_HEAD(&s->memcg_params.children); } static int init_memcg_params(struct kmem_cache *s, @@ -150,10 +150,10 @@ static int init_memcg_params(struct kmem_cache *s, { struct memcg_cache_array *arr; - if (memcg) { - s->memcg_params.is_root_cache = false; - s->memcg_params.memcg = memcg; + if (root_cache) { s->memcg_params.root_cache = root_cache; + s->memcg_params.memcg = memcg; + INIT_LIST_HEAD(&s->memcg_params.children_node); return 0; } @@ -223,7 +223,7 @@ int memcg_update_all_caches(int num_memcgs) static void unlink_memcg_cache(struct kmem_cache *s) { - list_del(&s->memcg_params.list); + list_del(&s->memcg_params.children_node); } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -594,7 +594,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + list_add(&s->memcg_params.children_node, + &root_cache->memcg_params.children); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -690,7 +691,7 @@ static int shutdown_memcg_caches(struct kmem_cache *s) * list so as not to try to destroy it for a second * time while iterating over inactive caches below. */ - list_move(&c->memcg_params.list, &busy); + list_move(&c->memcg_params.children_node, &busy); else /* * The cache is empty and will be destroyed soon. Clear @@ -705,17 +706,17 @@ static int shutdown_memcg_caches(struct kmem_cache *s) * Second, shutdown all caches left from memory cgroups that are now * offline. */ - list_for_each_entry_safe(c, c2, &s->memcg_params.list, - memcg_params.list) + list_for_each_entry_safe(c, c2, &s->memcg_params.children, + memcg_params.children_node) shutdown_cache(c); - list_splice(&busy, &s->memcg_params.list); + list_splice(&busy, &s->memcg_params.children); /* * A cache being destroyed must be empty. In particular, this means * that all per memcg caches attached to it must be empty too. */ - if (!list_empty(&s->memcg_params.list)) + if (!list_empty(&s->memcg_params.children)) return -EBUSY; return 0; } -- cgit v1.2.3 From e2c79dcaa916e4044ec94770b66c890eef050346 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:25:58 +1100 Subject: slab: link memcg kmem_caches on their associated memory cgroup With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. 
When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. While a memcg kmem_cache is listed on its root cache's ->children list, there is no direct way to iterate all kmem_caches which are assocaited with a memory cgroup. The only way to iterate them is walking all caches while filtering out caches which don't match, which would be most of them. This makes memcg destruction operations O(N^2) where N is the total number of slab caches which can be huge. This combined with the synchronous RCU operations can tie up a CPU and affect the whole machine for many hours when memory reclaim triggers offlining and destruction of the stale memcgs. This patch adds mem_cgroup->kmem_caches list which goes through memcg_cache_params->kmem_caches_node of all kmem_caches which are associated with the memcg. All memcg specific iterations, including stat file access, are updated to use the new list instead. Link: http://lkml.kernel.org/r/20170117235411.9408-6-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + include/linux/slab.h | 3 +++ mm/memcontrol.c | 7 ++++--- mm/slab.h | 3 +++ mm/slab_common.c | 36 +++++++++++++++++++++++++++++------- 5 files changed, 40 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 254698856b8f..9fcece9be85d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -253,6 +253,7 @@ struct mem_cgroup { /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; + struct list_head kmem_caches; #endif int last_scanned_node; diff --git a/include/linux/slab.h b/include/linux/slab.h index 1f611ba00f1d..a0cc7a77cda2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -565,6 +565,8 @@ struct memcg_cache_array { * @memcg: Pointer to the memcg this cache belongs to. * * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list. 
*/ struct memcg_cache_params { struct kmem_cache *root_cache; @@ -576,6 +578,7 @@ struct memcg_cache_params { struct { struct mem_cgroup *memcg; struct list_head children_node; + struct list_head kmem_caches_node; }; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b822e158b319..834d641dfa8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; + INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_start = slab_start, - .seq_next = slab_next, - .seq_stop = slab_stop, + .seq_start = memcg_slab_start, + .seq_next = memcg_slab_next, + .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif diff --git a/mm/slab.h b/mm/slab.h index d50f919c2e8c..3a3689d8245d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -489,6 +489,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); diff --git a/mm/slab_common.c b/mm/slab_common.c index 40192d153cdc..122425e5ce60 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -154,6 +154,7 @@ static int init_memcg_params(struct kmem_cache *s, s->memcg_params.root_cache = root_cache; s->memcg_params.memcg = memcg; INIT_LIST_HEAD(&s->memcg_params.children_node); + INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; } @@ -224,6 +225,7 @@ int memcg_update_all_caches(int num_memcgs) static void unlink_memcg_cache(struct kmem_cache *s) { list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -596,6 +598,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, list_add(&s->memcg_params.children_node, &root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -651,9 +654,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params.memcg != memcg) - continue; + list_for_each_entry_safe(s, s2, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { /* * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. 
@@ -1209,15 +1211,35 @@ static int slab_show(struct seq_file *m, void *p) } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void *memcg_slab_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mutex_lock(&slab_mutex); + return seq_list_start(&memcg->kmem_caches, *pos); +} + +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + return seq_list_next(p, &memcg->kmem_caches, pos); +} + +void memcg_slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + int memcg_slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, + memcg_params.kmem_caches_node); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (p == slab_caches.next) + if (p == memcg->kmem_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params.memcg == memcg) - cache_show(s, m); + cache_show(s, m); return 0; } #endif -- cgit v1.2.3 From 3df93c6de29b2a2d0aa9c0151c6e66be4afa2419 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:25:59 +1100 Subject: slab: implement slab_root_caches list With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. slab_caches currently lists all caches including root and memcg ones. This is the only data structure which lists the root caches and iterating root caches can only be done by walking the list while skipping over memcg caches. As there can be a huge number of memcg caches, this can become very expensive. This also can make /proc/slabinfo behave very badly. seq_file processes reads in 4k chunks and seeks to the previous Nth position on slab_caches list to resume after each chunk. With a lot of memcg cache churns on the list, reading /proc/slabinfo can become very slow and its content often ends up with duplicate and/or missing entries. This patch adds a new list slab_root_caches which lists only the root caches. When memcg is not enabled, it becomes just an alias of slab_caches. memcg specific list operations are collected into memcg_[un]link_cache(). Link: http://lkml.kernel.org/r/20170117235411.9408-7-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/slab.h | 3 +++ mm/slab.h | 15 +++++++++++++ mm/slab_common.c | 59 ++++++++++++++++++++++++++++++---------------------- mm/slub.c | 1 + 4 files changed, 53 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index a0cc7a77cda2..af1a5bef80f4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -556,6 +556,8 @@ struct memcg_cache_array { * used to index child cachces during allocation and cleared * early during shutdown. * + * @root_caches_node: List node for slab_root_caches list. + * * @children: List of all child caches. 
While the child caches are also * reachable through @memcg_caches, a child cache remains on * this list until it is actually destroyed. @@ -573,6 +575,7 @@ struct memcg_cache_params { union { struct { struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; struct list_head children; }; struct { diff --git a/mm/slab.h b/mm/slab.h index 3a3689d8245d..9cc9591e0bbe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -196,6 +196,11 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +/* List of all root caches. */ +extern struct list_head slab_root_caches; +#define root_caches_node memcg_params.__root_caches_node + /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -295,9 +300,14 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, } extern void slab_init_memcg_params(struct kmem_cache *); +extern void memcg_link_cache(struct kmem_cache *s); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* If !memcg, all caches are root. */ +#define slab_root_caches slab_caches +#define root_caches_node list + #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -342,6 +352,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } + +static inline void memcg_link_cache(struct kmem_cache *s) +{ +} + #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 122425e5ce60..40c459a28dfd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -138,6 +138,9 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + +LIST_HEAD(slab_root_caches); + void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.root_cache = NULL; @@ -183,9 +186,6 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size) { struct memcg_cache_array *old, *new; - if (!is_root_cache(s)) - return 0; - new = kzalloc(sizeof(struct memcg_cache_array) + new_array_size * sizeof(void *), GFP_KERNEL); if (!new) @@ -209,7 +209,7 @@ int memcg_update_all_caches(int num_memcgs) int ret = 0; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { + list_for_each_entry(s, &slab_root_caches, root_caches_node) { ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches @@ -222,10 +222,26 @@ int memcg_update_all_caches(int num_memcgs) return ret; } -static void unlink_memcg_cache(struct kmem_cache *s) +void memcg_link_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) { + list_add(&s->root_caches_node, &slab_root_caches); + } else { + list_add(&s->memcg_params.children_node, + &s->memcg_params.root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, + &s->memcg_params.memcg->kmem_caches); + } +} + +static void memcg_unlink_cache(struct kmem_cache *s) { - list_del(&s->memcg_params.children_node); - list_del(&s->memcg_params.kmem_caches_node); + if (is_root_cache(s)) { + list_del(&s->root_caches_node); + } else { + list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); + } } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -238,7 +254,7 @@ static inline void 
destroy_memcg_params(struct kmem_cache *s) { } -static inline void unlink_memcg_cache(struct kmem_cache *s) +static inline void memcg_unlink_cache(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ @@ -285,7 +301,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, if (flags & SLAB_NEVER_MERGE) return NULL; - list_for_each_entry_reverse(s, &slab_caches, list) { + list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) { if (slab_unmergeable(s)) continue; @@ -369,6 +385,7 @@ static struct kmem_cache *create_cache(const char *name, s->refcount = 1; list_add(&s->list, &slab_caches); + memcg_link_cache(s); out: if (err) return ERR_PTR(err); @@ -514,9 +531,8 @@ static int shutdown_cache(struct kmem_cache *s) if (__kmem_cache_shutdown(s) != 0) return -EBUSY; + memcg_unlink_cache(s); list_del(&s->list); - if (!is_root_cache(s)) - unlink_memcg_cache(s); if (s->flags & SLAB_DESTROY_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); @@ -596,10 +612,6 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } - list_add(&s->memcg_params.children_node, - &root_cache->memcg_params.children); - list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches); - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -627,10 +639,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - + list_for_each_entry(s, &slab_root_caches, root_caches_node) { arr = rcu_dereference_protected(s->memcg_params.memcg_caches, lockdep_is_held(&slab_mutex)); c = arr->entries[idx]; @@ -829,6 +838,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, create_boot_cache(s, name, size, flags); list_add(&s->list, &slab_caches); + memcg_link_cache(s); s->refcount = 1; return s; } @@ -1144,12 +1154,12 @@ static void print_slabinfo_header(struct seq_file *m) void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); - return seq_list_start(&slab_caches, *pos); + return seq_list_start(&slab_root_caches, *pos); } void *slab_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &slab_caches, pos); + return seq_list_next(p, &slab_root_caches, pos); } void slab_stop(struct seq_file *m, void *p) @@ -1201,12 +1211,11 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m) static int slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node); - if (p == slab_caches.next) + if (p == slab_root_caches.next) print_slabinfo_header(m); - if (is_root_cache(s)) - cache_show(s, m); + cache_show(s, m); return 0; } diff --git a/mm/slub.c b/mm/slub.c index caac5456f0ec..03b012bcb5fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4127,6 +4127,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) } slab_init_memcg_params(s); list_add(&s->list, &slab_caches); + memcg_link_cache(s); return s; } -- cgit v1.2.3 From e526ce7bac8263c793ea69057bf7b1e83e6a8845 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:26:00 +1100 Subject: slab: remove synchronous synchronize_sched() from memcg cache deactivation path With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty 
kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. slub uses synchronize_sched() to deactivate a memcg cache. synchronize_sched() is an expensive and slow operation and doesn't scale when a huge number of caches are destroyed back-to-back. While there used to be a simple batching mechanism, the batching was too restricted to be helpful. This patch implements slab_deactivate_memcg_cache_rcu_sched() which slub can use to schedule sched RCU callback instead of performing synchronize_sched() synchronously while holding cgroup_mutex. While this adds online cpus, mems and slab_mutex operations, operating on these locks back-to-back from the same kworker, which is what's gonna happen when there are many to deactivate, isn't expensive at all and this gets rid of the scalability problem completely. Link: http://lkml.kernel.org/r/20170117235411.9408-9-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/slab.h | 6 ++++++ mm/slab.h | 2 ++ mm/slab_common.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 12 +++++++---- 4 files changed, 76 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index af1a5bef80f4..3c37a8c51921 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -582,6 +582,12 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head children_node; struct list_head kmem_caches_node; + + void (*deact_fn)(struct kmem_cache *); + union { + struct rcu_head deact_rcu_head; + struct work_struct deact_work; + }; }; }; }; diff --git a/mm/slab.h b/mm/slab.h index b2b110806246..360f157b1da2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -302,6 +302,8 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, extern void slab_init_memcg_params(struct kmem_cache *); extern void memcg_link_cache(struct kmem_cache *s); +extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)); #else /* CONFIG_MEMCG && !CONFIG_SLOB */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 57d873aca917..fe0cb0380a8d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -627,6 +627,66 @@ out_unlock: put_online_cpus(); } +static void kmemcg_deactivate_workfn(struct work_struct *work) +{ + struct kmem_cache *s = container_of(work, struct kmem_cache, + memcg_params.deact_work); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + s->memcg_params.deact_fn(s); + + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */ + css_put(&s->memcg_params.memcg->css); +} + +static void kmemcg_deactivate_rcufn(struct rcu_head *head) +{ + struct kmem_cache *s = container_of(head, struct kmem_cache, + memcg_params.deact_rcu_head); + + /* + * We need to grab blocking locks. Bounce to ->deact_work. The + * work item shares the space with the RCU head and can't be + * initialized eariler. 
+ */ + INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); + schedule_work(&s->memcg_params.deact_work); +} + +/** + * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a + * sched RCU grace period + * @s: target kmem_cache + * @deact_fn: deactivation function to call + * + * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex + * held after a sched RCU grace period. The slab is guaranteed to stay + * alive until @deact_fn is finished. This is to be used from + * __kmemcg_cache_deactivate(). + */ +void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s, + void (*deact_fn)(struct kmem_cache *)) +{ + if (WARN_ON_ONCE(is_root_cache(s)) || + WARN_ON_ONCE(s->memcg_params.deact_fn)) + return; + + /* pin memcg so that @s doesn't get destroyed in the middle */ + css_get(&s->memcg_params.memcg->css); + + s->memcg_params.deact_fn = deact_fn; + call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn); +} + void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) { int idx; diff --git a/mm/slub.c b/mm/slub.c index 8a4591526f37..62d0b557a596 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3957,6 +3957,12 @@ int __kmem_cache_shrink(struct kmem_cache *s) } #ifdef CONFIG_MEMCG +static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s) +{ + /* called with all the locks held after a sched RCU grace period */ + __kmem_cache_shrink(s); +} + void __kmemcg_cache_deactivate(struct kmem_cache *s) { /* @@ -3968,11 +3974,9 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) /* * s->cpu_partial is checked locklessly (see put_cpu_partial), so - * we have to make sure the change is visible. + * we have to make sure the change is visible before shrinking. */ - synchronize_sched(); - - __kmem_cache_shrink(s); + slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } #endif -- cgit v1.2.3 From eb6fa2531b941eab10ef1987c573ecf1204e5a8c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:26:01 +1100 Subject: slab: use memcg_kmem_cache_wq for slab destruction operations If there's contention on slab_mutex, queueing the per-cache destruction work item on the system_wq can unnecessarily create and tie up a lot of kworkers. Rename memcg_kmem_cache_create_wq to memcg_kmem_cache_wq and make it global and use that workqueue for the destruction work items too. While at it, convert the workqueue from an unbound workqueue to a per-cpu one with concurrency limited to 1. It's generally preferable to use per-cpu workqueues and concurrency limit of 1 is safe enough. This is suggested by Joonsoo Kim. 
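The end result is a single shared, concurrency-limited workqueue used for both kinds of work items (sketch only; the actual change is in the diff below):

	/* Per-cpu workqueue, max_active == 1, shared by kmem cache
	 * creation and deactivation/destruction work items. */
	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);

	queue_work(memcg_kmem_cache_wq, &cw->work);                   /* creation */
	queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); /* destruction */
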
Link: http://lkml.kernel.org/r/20170117235411.9408-11-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + mm/memcontrol.c | 16 ++++++++-------- mm/slab_common.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9fcece9be85d..5af377303880 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -830,6 +830,7 @@ void memcg_kmem_uncharge(struct page *page, int order); #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) extern struct static_key_false memcg_kmem_enabled_key; +extern struct workqueue_struct *memcg_kmem_cache_wq; extern int memcg_nr_cache_ids; void memcg_get_cache_ids(void); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 834d641dfa8c..1fd6affcdde7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -317,6 +317,8 @@ void memcg_put_cache_ids(void) DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); +struct workqueue_struct *memcg_kmem_cache_wq; + #endif /* !CONFIG_SLOB */ /** @@ -2143,8 +2145,6 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; -static struct workqueue_struct *memcg_kmem_cache_create_wq; - static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2176,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - queue_work(memcg_kmem_cache_create_wq, &cw->work); + queue_work(memcg_kmem_cache_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5778,12 +5778,12 @@ static int __init mem_cgroup_init(void) #ifndef CONFIG_SLOB /* * Kmem cache creation is mostly done with the slab_mutex held, - * so use a special workqueue to avoid stalling all worker - * threads in case lots of cgroups are created simultaneously. + * so use a workqueue with limited concurrency to avoid stalling + * all worker threads in case lots of cgroups are created and + * destroyed simultaneously. */ - memcg_kmem_cache_create_wq = - alloc_ordered_workqueue("memcg_kmem_cache_create", 0); - BUG_ON(!memcg_kmem_cache_create_wq); + memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); + BUG_ON(!memcg_kmem_cache_wq); #endif cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, diff --git a/mm/slab_common.c b/mm/slab_common.c index fe0cb0380a8d..2a36518609b0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -659,7 +659,7 @@ static void kmemcg_deactivate_rcufn(struct rcu_head *head) * initialized eariler. */ INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn); - schedule_work(&s->memcg_params.deact_work); + queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work); } /** -- cgit v1.2.3 From 506cbe13537366c300e01392f90dfde4f1f76e99 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:04 +1100 Subject: mm, trace: extract COMPACTION_STATUS and ZONE_TYPE to a common header COMPACTION_STATUS resp. ZONE_TYPE are currently used to translate enum compact_result resp. struct zone index into their symbolic names for an easier post processing. The follow up patch would like to reuse this as well. 
The code involves some preprocessor black magic which is better not duplicated elsewhere so move it to a common mm tracing relate header. Link: http://lkml.kernel.org/r/20161220130135.15719-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/trace/events/compaction.h | 60 ++---------------------------------- include/trace/events/mmflags.h | 64 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index cbdb90b6b308..0a18ab6483ff 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -9,62 +9,6 @@ #include #include -#define COMPACTION_STATUS \ - EM( COMPACT_SKIPPED, "skipped") \ - EM( COMPACT_DEFERRED, "deferred") \ - EM( COMPACT_CONTINUE, "continue") \ - EM( COMPACT_SUCCESS, "success") \ - EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ - EM( COMPACT_COMPLETE, "complete") \ - EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ - EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ - EMe(COMPACT_CONTENDED, "contended") - -#ifdef CONFIG_ZONE_DMA -#define IFDEF_ZONE_DMA(X) X -#else -#define IFDEF_ZONE_DMA(X) -#endif - -#ifdef CONFIG_ZONE_DMA32 -#define IFDEF_ZONE_DMA32(X) X -#else -#define IFDEF_ZONE_DMA32(X) -#endif - -#ifdef CONFIG_HIGHMEM -#define IFDEF_ZONE_HIGHMEM(X) X -#else -#define IFDEF_ZONE_HIGHMEM(X) -#endif - -#define ZONE_TYPE \ - IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ - IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ - EM (ZONE_NORMAL, "Normal") \ - IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ - EMe(ZONE_MOVABLE,"Movable") - -/* - * First define the enums in the above macros to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -COMPACTION_STATUS -ZONE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. - */ -#undef EM -#undef EMe -#define EM(a, b) {a, b}, -#define EMe(a, b) {a, b} DECLARE_EVENT_CLASS(mm_compaction_isolate_template, @@ -187,6 +131,7 @@ TRACE_EVENT(mm_compaction_begin, __entry->sync ? "sync" : "async") ); +#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, @@ -220,6 +165,7 @@ TRACE_EVENT(mm_compaction_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); +#endif TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -248,6 +194,7 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio) ); +#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, @@ -295,7 +242,6 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); -#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 15bf875d0e4a..75ed3220ede2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -1,3 +1,6 @@ +#include +#include +#include /* * The order of these masks is important. Matching masks will be seen * first and the left over flags will end up showing by themselves. 
@@ -171,3 +174,64 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ (flags) ? __print_flags(flags, "|", \ __def_vmaflag_names \ ) : "none" + +#ifdef CONFIG_COMPACTION +#define COMPACTION_STATUS \ + EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ + EM( COMPACT_CONTINUE, "continue") \ + EM( COMPACT_SUCCESS, "success") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ + EM( COMPACT_COMPLETE, "complete") \ + EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ + EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ + EMe(COMPACT_CONTENDED, "contended") +#else +#define COMPACTION_STATUS +#endif + +#ifdef CONFIG_ZONE_DMA +#define IFDEF_ZONE_DMA(X) X +#else +#define IFDEF_ZONE_DMA(X) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define IFDEF_ZONE_DMA32(X) X +#else +#define IFDEF_ZONE_DMA32(X) +#endif + +#ifdef CONFIG_HIGHMEM +#define IFDEF_ZONE_HIGHMEM(X) X +#else +#define IFDEF_ZONE_HIGHMEM(X) +#endif + +#define ZONE_TYPE \ + IFDEF_ZONE_DMA( EM (ZONE_DMA, "DMA")) \ + IFDEF_ZONE_DMA32( EM (ZONE_DMA32, "DMA32")) \ + EM (ZONE_NORMAL, "Normal") \ + IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ + EMe(ZONE_MOVABLE,"Movable") + +/* + * First define the enums in the above macros to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +COMPACTION_STATUS +ZONE_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. + */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} -- cgit v1.2.3 From f0615c00b69e4a5d910f64ae29e3349cb0043d25 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:05 +1100 Subject: oom, trace: add oom detection tracepoints should_reclaim_retry is the central decision point for declaring the OOM. It might be really useful to expose data used for this decision making when debugging an unexpected oom situations. Say we have an OOM report: [ 52.264001] mem_eater invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_score_adj=0 [ 52.267549] CPU: 3 PID: 3148 Comm: mem_eater Tainted: G W 4.8.0-oomtrace3-00006-gb21338b386d2 #1024 Now we can check the tracepoint data to see how we have ended up in this situation: mem_eater-3148 [003] .... 52.432801: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11134 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433269: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11103 min_wmark=11084 no_progress_loops=1 wmark_check=1 mem_eater-3148 [003] .... 52.433712: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11100 min_wmark=11084 no_progress_loops=2 wmark_check=1 mem_eater-3148 [003] .... 52.434067: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11097 min_wmark=11084 no_progress_loops=3 wmark_check=1 mem_eater-3148 [003] .... 52.434414: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11094 min_wmark=11084 no_progress_loops=4 wmark_check=1 mem_eater-3148 [003] .... 52.434761: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11091 min_wmark=11084 no_progress_loops=5 wmark_check=1 mem_eater-3148 [003] .... 52.435108: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11087 min_wmark=11084 no_progress_loops=6 wmark_check=1 mem_eater-3148 [003] .... 
52.435478: reclaim_retry_zone: node=0 zone=DMA32 order=0 reclaimable=51 available=11084 min_wmark=11084 no_progress_loops=7 wmark_check=0 mem_eater-3148 [003] .... 52.435478: reclaim_retry_zone: node=0 zone=DMA order=0 reclaimable=0 available=1126 min_wmark=179 no_progress_loops=7 wmark_check=0 The above shows that we can quickly deduce that the reclaim stopped making any progress (see no_progress_loops increased in each round) and while there were still some 51 reclaimable pages they couldn't be dropped for some reason (vmscan trace points would tell us more about that part). available will represent reclaimable + free_pages scaled down per no_progress_loops factor. This is essentially an optimistic estimate of how much memory we would have when reclaiming everything. This can be compared to min_wmark to get a rought idea but the wmark_check tells the result of the watermark check which is more precise (includes lowmem reserves, considers the order etc.). As we can see no zone is eligible in the end and that is why we have triggered the oom in this situation. Please note that higher order requests might fail on the wmark_check even when there is much more memory available than min_wmark - e.g. when the memory is fragmented. A follow up tracepoint will help to debug those situations. Link: http://lkml.kernel.org/r/20161220130135.15719-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/trace/events/oom.h | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 10 ++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 1e974983757e..9160da7a26a0 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -4,6 +4,7 @@ #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include +#include TRACE_EVENT(oom_score_adj_update, @@ -27,6 +28,47 @@ TRACE_EVENT(oom_score_adj_update, __entry->pid, __entry->comm, __entry->oom_score_adj) ); +TRACE_EVENT(reclaim_retry_zone, + + TP_PROTO(struct zoneref *zoneref, + int order, + unsigned long reclaimable, + unsigned long available, + unsigned long min_wmark, + int no_progress_loops, + bool wmark_check), + + TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), + + TP_STRUCT__entry( + __field( int, node) + __field( int, zone_idx) + __field( int, order) + __field( unsigned long, reclaimable) + __field( unsigned long, available) + __field( unsigned long, min_wmark) + __field( int, no_progress_loops) + __field( bool, wmark_check) + ), + + TP_fast_assign( + __entry->node = zone_to_nid(zoneref->zone); + __entry->zone_idx = zoneref->zone_idx; + __entry->order = order; + __entry->reclaimable = reclaimable; + __entry->available = available; + __entry->min_wmark = min_wmark; + __entry->no_progress_loops = no_progress_loops; + __entry->wmark_check = wmark_check; + ), + + TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", + __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), + __entry->order, + __entry->reclaimable, __entry->available, __entry->min_wmark, + __entry->no_progress_loops, + __entry->wmark_check) +); #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index af65c4eedc79..d20f8c3139bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ 
-55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -3468,6 +3469,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3478,8 +3481,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? */ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for -- cgit v1.2.3 From e2df9e7b212f62e3733abc44cf3a1a364cc68b0f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:05 +1100 Subject: oom, trace: add compaction retry tracepoint Higher order requests oom debugging is currently quite hard. We do have some compaction points which can tell us how the compaction is operating but there is no trace point to tell us about compaction retry logic. This patch adds a one which will have the following format bash-3126 [001] .... 1498.220001: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=withdrawn retries=0 max_retries=16 should_retry=0 we can see that the order 9 request is not retried even though we are in the highest compaction priority mode becase the last compaction attempt was withdrawn. This means that compaction_zonelist_suitable must have returned false and there is no suitable zone to compact for this request and so no need to retry further. another example would be <...>-3137 [001] .... 81.501689: compact_retry: order=9 priority=COMPACT_PRIO_SYNC_LIGHT compaction_result=failed retries=0 max_retries=16 should_retry=0 in this case the order-9 compaction failed to find any suitable block. We do not retry anymore because this is a costly request and those do not go below COMPACT_PRIO_SYNC_LIGHT priority. Link: http://lkml.kernel.org/r/20161220130135.15719-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/trace/events/mmflags.h | 26 ++++++++++++++++++++++++++ include/trace/events/oom.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 22 ++++++++++++++++------ 3 files changed, 81 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 75ed3220ede2..17c45bcf8f92 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -186,8 +186,32 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ EMe(COMPACT_CONTENDED, "contended") + +/* High-level compaction status feedback */ +#define COMPACTION_FAILED 1 +#define COMPACTION_WITHDRAWN 2 +#define COMPACTION_PROGRESS 3 + +#define compact_result_to_feedback(result) \ +({ \ + enum compact_result __result = result; \ + (compaction_failed(__result)) ? COMPACTION_FAILED : \ + (compaction_withdrawn(__result)) ? 
COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \ +}) + +#define COMPACTION_FEEDBACK \ + EM(COMPACTION_FAILED, "failed") \ + EM(COMPACTION_WITHDRAWN, "withdrawn") \ + EMe(COMPACTION_PROGRESS, "progress") + +#define COMPACTION_PRIORITY \ + EM(COMPACT_PRIO_SYNC_FULL, "COMPACT_PRIO_SYNC_FULL") \ + EM(COMPACT_PRIO_SYNC_LIGHT, "COMPACT_PRIO_SYNC_LIGHT") \ + EMe(COMPACT_PRIO_ASYNC, "COMPACT_PRIO_ASYNC") #else #define COMPACTION_STATUS +#define COMPACTION_PRIORITY +#define COMPACTION_FEEDBACK #endif #ifdef CONFIG_ZONE_DMA @@ -225,6 +249,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ #define EMe(a, b) TRACE_DEFINE_ENUM(a); COMPACTION_STATUS +COMPACTION_PRIORITY +COMPACTION_FEEDBACK ZONE_TYPE /* diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 9160da7a26a0..38baeb27221a 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -69,6 +69,45 @@ TRACE_EVENT(reclaim_retry_zone, __entry->no_progress_loops, __entry->wmark_check) ); + +#ifdef CONFIG_COMPACTION +TRACE_EVENT(compact_retry, + + TP_PROTO(int order, + enum compact_priority priority, + enum compact_result result, + int retries, + int max_retries, + bool ret), + + TP_ARGS(order, priority, result, retries, max_retries, ret), + + TP_STRUCT__entry( + __field( int, order) + __field( int, priority) + __field( int, result) + __field( int, retries) + __field( int, max_retries) + __field( bool, ret) + ), + + TP_fast_assign( + __entry->order = order; + __entry->priority = priority; + __entry->result = compact_result_to_feedback(result); + __entry->retries = retries; + __entry->max_retries = max_retries; + __entry->ret = ret; + ), + + TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", + __entry->order, + __print_symbolic(__entry->priority, COMPACTION_PRIORITY), + __print_symbolic(__entry->result, COMPACTION_FEEDBACK), + __entry->retries, __entry->max_retries, + __entry->ret) +); +#endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d20f8c3139bb..05c0a59323bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3197,6 +3197,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3218,8 +3221,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3231,8 +3236,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3241,12 +3248,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * -- cgit v1.2.3 From 2a530ac3db3f1919f49088f08324d87d59bbb18b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 8 Feb 2017 10:26:06 +1100 Subject: userfaultfd: document _IOR/_IOW Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2 These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging. 1) tmpfs non present userfault 2) hugetlbfs non present userfault 3) non cooperative userfault for fork/madvise/mremap qemu development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used but there's a self test and we know some qemu user for various reasons uses tmpfs as backing for KVM so it'll need it too to use postcopy live migration with tmpfs memory. All review feedback from the previous submit has been handled and the fixes are included. There's no outstanding issue AFIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submits, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non cooperative testcase actually reproduced this problem by getting stuck instead of quitting clean in some rare case as it could call UFFDIO_UNREGISTER while some userfault could be still in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead but the wakeup during unregister I think is preferable. David also asked the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added so QEMU can avoid to probe if the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. There are 64 available and worst case we've to bump the API if someday we're really going to run out of them. The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still of the order of dozen milliseconds on 10GBit if at 2MB hugepage granularity so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory, UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR and it also reminds whoever will bump the UFFDIO_API in the future, to change the two ioctl to _IOW. This was found while implementing strace support for those ioctl, otherwise we could have never found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. 
Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: "Dmitry V. Levin" Cc: Michael Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton --- include/uapi/asm-generic/ioctl.h | 10 +++++++++- include/uapi/linux/userfaultfd.h | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/asm-generic/ioctl.h b/include/uapi/asm-generic/ioctl.h index 7e7c11b52143..749b32fe5623 100644 --- a/include/uapi/asm-generic/ioctl.h +++ b/include/uapi/asm-generic/ioctl.h @@ -48,6 +48,9 @@ /* * Direction bits, which any architecture can choose to override * before including this file. + * + * NOTE: _IOC_WRITE means userland is writing and kernel is + * reading. _IOC_READ means userland is reading and kernel is writing. */ #ifndef _IOC_NONE @@ -72,7 +75,12 @@ #define _IOC_TYPECHECK(t) (sizeof(t)) #endif -/* used to create numbers */ +/* + * Used to create numbers. + * + * NOTE: _IOW means userland is writing and kernel is reading. _IOR + * means userland is reading and kernel is writing. + */ #define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) #define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size))) #define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size))) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9057d7af3ae1..94046b8aa6ad 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,6 +11,12 @@ #include +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ #define UFFD_API ((__u64)0xAA) /* * After implementing the respective features it will become: -- cgit v1.2.3 From 2eb7a4205067c18d9615e1442633a3f03355f08f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Feb 2017 10:26:09 +1100 Subject: userfaultfd: non-cooperative: Add fork() event When the mm with uffd-ed vmas fork()-s the respective vmas notify their uffds with the event which contains a descriptor with new uffd. This new descriptor can then be used to get events from the child and populate its mm with data. Note, that there can be different uffd-s controlling different vmas within one mm, so first we should collect all those uffds (and ctx-s) in a list and then notify them all one by one but only once per fork(). The context is created at fork() time but the descriptor, file struct and anon inode object is created at event read time. So some trickery is added to the userfaultfd_ctx_read() to handle the ctx queues' locking vs file creation. Another thing worth noticing is that the task that fork()-s waits for the uffd event to get processed WITHOUT the mmap sem. Link: http://lkml.kernel.org/r/20161216144821.5183-9-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 146 ++++++++++++++++++++++++++++++++++++++- include/linux/userfaultfd_k.h | 13 ++++ include/uapi/linux/userfaultfd.h | 15 ++-- kernel/fork.c | 10 ++- 4 files changed, 168 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 87d31921b66c..8c91cf8e8999 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -64,6 +64,12 @@ struct userfaultfd_ctx { struct mm_struct *mm; }; +struct userfaultfd_fork_ctx { + struct userfaultfd_ctx *orig; + struct userfaultfd_ctx *new; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -465,9 +471,8 @@ out: return ret; } -static int __maybe_unused userfaultfd_event_wait_completion( - struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) +static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) { int ret = 0; @@ -518,6 +523,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, __remove_wait_queue(&ctx->event_wqh, &ewq->wq); } +int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) +{ + struct userfaultfd_ctx *ctx = NULL, *octx; + struct userfaultfd_fork_ctx *fctx; + + octx = vma->vm_userfaultfd_ctx.ctx; + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + return 0; + } + + list_for_each_entry(fctx, fcs, list) + if (fctx->orig == octx) { + ctx = fctx->new; + break; + } + + if (!ctx) { + fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); + if (!fctx) + return -ENOMEM; + + ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); + if (!ctx) { + kfree(fctx); + return -ENOMEM; + } + + atomic_set(&ctx->refcount, 1); + ctx->flags = octx->flags; + ctx->state = UFFD_STATE_RUNNING; + ctx->features = octx->features; + ctx->released = false; + ctx->mm = vma->vm_mm; + atomic_inc(&ctx->mm->mm_users); + + userfaultfd_ctx_get(octx); + fctx->orig = octx; + fctx->new = ctx; + list_add_tail(&fctx->list, fcs); + } + + vma->vm_userfaultfd_ctx.ctx = ctx; + return 0; +} + +static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +{ + struct userfaultfd_ctx *ctx = fctx->orig; + struct userfaultfd_wait_queue ewq; + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_FORK; + ewq.msg.arg.reserved.reserved1 = (__u64)fctx->new; + + return userfaultfd_event_wait_completion(ctx, &ewq); +} + +void dup_userfaultfd_complete(struct list_head *fcs) +{ + int ret = 0; + struct userfaultfd_fork_ctx *fctx, *n; + + list_for_each_entry_safe(fctx, n, fcs, list) { + if (!ret) + ret = dup_fctx(fctx); + list_del(&fctx->list); + kfree(fctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; @@ -653,12 +731,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } } +static const struct file_operations userfaultfd_fops; + +static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, + struct userfaultfd_ctx *new, + struct uffd_msg *msg) +{ + int fd; + struct file *file; + unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; + + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | flags); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + + fd_install(fd, 
file); + msg->arg.reserved.reserved1 = 0; + msg->arg.fork.ufd = fd; + + return 0; +} + static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); struct userfaultfd_wait_queue *uwq; + /* + * Handling fork event requires sleeping operations, so + * we drop the event_wqh lock, then do these ops, then + * lock it back and wake up the waiter. While the lock is + * dropped the ewq may go away so we keep track of it + * carefully. + */ + LIST_HEAD(fork_event); + struct userfaultfd_ctx *fork_nctx = NULL; /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); @@ -716,6 +831,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (uwq) { *msg = uwq->msg; + if (uwq->msg.event == UFFD_EVENT_FORK) { + fork_nctx = (struct userfaultfd_ctx *)uwq->msg.arg.reserved.reserved1; + list_move(&uwq->wq.task_list, &fork_event); + spin_unlock(&ctx->event_wqh.lock); + ret = 0; + break; + } + userfaultfd_event_complete(ctx, uwq); spin_unlock(&ctx->event_wqh.lock); ret = 0; @@ -739,6 +862,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, __set_current_state(TASK_RUNNING); spin_unlock(&ctx->fd_wqh.lock); + if (!ret && msg->event == UFFD_EVENT_FORK) { + ret = resolve_userfault_fork(ctx, fork_nctx, msg); + + if (!ret) { + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.task_list); + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); + userfaultfd_event_complete(ctx, uwq); + } + spin_unlock(&ctx->event_wqh.lock); + } + } + return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 11b92b047a1e..79002bca1f43 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); } +extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); +extern void dup_userfaultfd_complete(struct list_head *); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -76,6 +79,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return false; } +static inline int dup_userfaultfd(struct vm_area_struct *vma, + struct list_head *l) +{ + return 0; +} + +static inline void dup_userfaultfd_complete(struct list_head *l) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 94046b8aa6ad..c8953c84fdcc 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,12 +18,7 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -/* - * After implementing the respective features it will become: - * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ - * UFFD_FEATURE_EVENT_FORK) - */ -#define UFFD_API_FEATURES (0) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -77,6 +72,10 @@ struct uffd_msg { __u64 address; } pagefault; + struct { + __u32 ufd; + } fork; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,9 +89,7 @@ struct uffd_msg { * Start at 0x12 and not at 0 to be more strict against bugs. 
*/ #define UFFD_EVENT_PAGEFAULT 0x12 -#if 0 /* not available yet */ #define UFFD_EVENT_FORK 0x13 -#endif /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -111,10 +108,8 @@ struct uffdio_api { * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. */ -#if 0 /* not available yet */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) -#endif __u64 features; __u64 ioctls; diff --git a/kernel/fork.c b/kernel/fork.c index 11c5c8ab827c..022e3b9c2a51 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -559,6 +560,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; + LIST_HEAD(uf); uprobe_start_dup_mmap(); if (down_write_killable(&oldmm->mmap_sem)) { @@ -615,12 +617,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= - ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; - tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); @@ -676,6 +679,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -- cgit v1.2.3 From db435fc89e0b9c4b09d5793c4384c6069facdc13 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Feb 2017 10:26:10 +1100 Subject: userfaultfd: non-cooperative: add mremap() event The event denotes that an area [start:end] moves to different location. Length change isn't reported as "new" addresses, if they appear on the uffd reader side they will not contain any data and the latter can just zeromap them. Waiting for the event ACK is also done outside of mmap sem, as for fork event. Link: http://lkml.kernel.org/r/20161216144821.5183-12-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 17 +++++++++++++++++ include/uapi/linux/userfaultfd.h | 11 ++++++++++- mm/mremap.c | 17 ++++++++++++----- 4 files changed, 76 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 27978f249016..68f978beefac 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -596,6 +596,43 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *vm_ctx) +{ + struct userfaultfd_ctx *ctx; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + vm_ctx->ctx = ctx; + userfaultfd_ctx_get(ctx); + } +} + +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, + unsigned long from, unsigned long to, + unsigned long len) +{ + struct userfaultfd_ctx *ctx = vm_ctx.ctx; + struct userfaultfd_wait_queue ewq; + + if (!ctx) + return; + + if (to & ~PAGE_MASK) { + userfaultfd_ctx_put(ctx); + return; + } + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_REMAP; + ewq.msg.arg.remap.from = from; + ewq.msg.arg.remap.to = to; + ewq.msg.arg.remap.len = len; + + userfaultfd_event_wait_completion(ctx, &ewq); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 79002bca1f43..7f318a46044b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -55,6 +55,12 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); +extern void mremap_userfaultfd_prep(struct vm_area_struct *, + struct vm_userfaultfd_ctx *); +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, + unsigned long from, unsigned long to, + unsigned long len); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -89,6 +95,17 @@ static inline void dup_userfaultfd_complete(struct list_head *l) { } +static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx *ctx) +{ +} + +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, + unsigned long from, + unsigned long to, + unsigned long len) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index c8953c84fdcc..79a85e5bd388 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -76,6 +77,12 @@ struct uffd_msg { __u32 ufd; } fork; + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + struct { /* unused reserved fields */ __u64 reserved1; @@ -90,6 +97,7 @@ struct uffd_msg { */ #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -110,6 +118,7 @@ struct uffdio_api { */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) __u64 features; __u64 ioctls; diff --git a/mm/mremap.c b/mm/mremap.c index 30d7d2482eea..504b560c013c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, - unsigned long new_len, unsigned long new_addr, bool *locked) + unsigned long new_len, unsigned long new_addr, + bool *locked, struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_addr = new_addr; new_addr = err; } else { + mremap_userfaultfd_prep(new_vma, uf); arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, - unsigned long new_addr, unsigned long new_len, bool *locked) + unsigned long new_addr, unsigned long new_len, bool *locked, + struct vm_userfaultfd_ctx *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); if (!(offset_in_page(ret))) goto out; out1: @@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long ret = -EINVAL; unsigned long charged = 0; bool locked = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + &locked, &uf); goto out; } @@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, &uf); } out: if (offset_in_page(ret)) { @@ -602,5 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); + mremap_userfaultfd_complete(uf, addr, new_addr, old_len); return ret; } -- cgit v1.2.3 From 11c39093e1dccdd1598093c601b245a3e47a235a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 8 Feb 2017 10:26:10 +1100 Subject: userfaultfd: 
non-cooperative: optimize mremap_userfaultfd_complete() Optimize the mremap_userfaultfd_complete() interface to pass only the vm_userfaultfd_ctx pointer through the stack as a microoptimization. Link: http://lkml.kernel.org/r/20161216144821.5183-13-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Hillf Danton Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 4 ++-- include/linux/userfaultfd_k.h | 4 ++-- mm/mremap.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68f978beefac..5d37c37854b0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -608,11 +608,11 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } } -void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx vm_ctx, +void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, unsigned long from, unsigned long to, unsigned long len) { - struct userfaultfd_ctx *ctx = vm_ctx.ctx; + struct userfaultfd_ctx *ctx = vm_ctx->ctx; struct userfaultfd_wait_queue ewq; if (!ctx) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 7f318a46044b..78ec197e8b47 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -57,7 +57,7 @@ extern void dup_userfaultfd_complete(struct list_head *); extern void mremap_userfaultfd_prep(struct vm_area_struct *, struct vm_userfaultfd_ctx *); -extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx, +extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); @@ -100,7 +100,7 @@ static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, { } -static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx ctx, +static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long from, unsigned long to, unsigned long len) diff --git a/mm/mremap.c b/mm/mremap.c index 504b560c013c..8779928d6a70 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -608,6 +608,6 @@ out: up_write(¤t->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); - mremap_userfaultfd_complete(uf, addr, new_addr, old_len); + mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); return ret; } -- cgit v1.2.3 From d17ecc47e7ecd8603bde4df2a216d0cb5294e4a5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 8 Feb 2017 10:26:11 +1100 Subject: userfaultfd: non-cooperative: add madvise() event for MADV_DONTNEED request If the page is punched out of the address space the uffd reader should know this and zeromap the respective area in case of the #PF event. Link: http://lkml.kernel.org/r/20161216144821.5183-14-aarcange@redhat.com Signed-off-by: Pavel Emelyanov Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 12 ++++++++++++ include/uapi/linux/userfaultfd.h | 10 +++++++++- mm/madvise.c | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d37c37854b0..ea9008254df4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -633,6 +633,34 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } +void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue ewq; + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + return; + + userfaultfd_ctx_get(ctx); + up_read(&mm->mmap_sem); + + *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ + + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_MADVDONTNEED; + ewq.msg.arg.madv_dn.start = start; + ewq.msg.arg.madv_dn.end = end; + + userfaultfd_event_wait_completion(ctx, &ewq); + + down_read(&mm->mmap_sem); +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 78ec197e8b47..f431861f22f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,6 +61,11 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); +extern void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -106,6 +111,13 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, unsigned long len) { } + +static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 79a85e5bd388..2bbf32319cf5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,7 +19,8 @@ */ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP) + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -83,6 +84,11 @@ struct uffd_msg { __u64 len; } remap; + struct { + __u64 start; + __u64 end; + } madv_dn; + struct { /* unused reserved fields */ __u64 reserved1; @@ -98,6 +104,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_MADVDONTNEED 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -119,6 +126,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) __u64 features; __u64 ioctls; diff --git a/mm/madvise.c b/mm/madvise.c index 
0e3828eae9f8..06ffb5a170de 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -477,6 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, return -EINVAL; zap_page_range(vma, start, end - start, NULL); + madvise_userfault_dontneed(vma, prev, start, end); return 0; } -- cgit v1.2.3 From 7d097b4b1418d538004baa1cdce0cb9ee0a3489b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 8 Feb 2017 10:26:12 +1100 Subject: userfaultfd: hugetlbfs: add copy_huge_page_from_user for hugetlb userfaultfd support userfaultfd UFFDIO_COPY allows user level code to copy data to a page at fault time. The data is copied from user space to a newly allocated huge page. The new routine copy_huge_page_from_user performs this copy. Link: http://lkml.kernel.org/r/20161216144821.5183-17-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ mm/memory.c | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index de045a48137c..87de452212e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2420,6 +2420,9 @@ extern void clear_huge_page(struct page *page, extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, unsigned int pages_per_huge_page); +extern long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; diff --git a/mm/memory.c b/mm/memory.c index ececdc4a2892..4ade940d105c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4152,6 +4152,31 @@ void copy_user_huge_page(struct page *dst, struct page *src, copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } + +long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page) +{ + void *src = (void *)usr_src; + void *page_kaddr; + unsigned long i, rc = 0; + unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + + for (i = 0; i < pages_per_huge_page; i++) { + page_kaddr = kmap_atomic(dst_page + i); + rc = copy_from_user(page_kaddr, + (const void __user *)(src + i * PAGE_SIZE), + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + ret_val -= (PAGE_SIZE - rc); + if (rc) + break; + + cond_resched(); + } + return ret_val; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS -- cgit v1.2.3 From d526b8790ab79b2f485424ebc7d9283696aa825b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 8 Feb 2017 10:26:12 +1100 Subject: userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support hugetlb_mcopy_atomic_pte is the low level routine that implements the userfaultfd UFFDIO_COPY command. It is based on the existing mcopy_atomic_pte routine with modifications for huge pages. Link: http://lkml.kernel.org/r/20161216144821.5183-18-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++ mm/hugetlb.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 48c76d612d40..aab2fff3e269 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -81,6 +81,11 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -149,6 +154,8 @@ static inline void hugetlb_show_meminfo(void) #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) +#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c7025c132670..dec628b26f59 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3948,6 +3948,87 @@ out_mutex: return ret; } +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. + */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct hstate *h = hstate_vma(dst_vma); + pte_t _dst_pte; + spinlock_t *ptl; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) + goto out; + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h)); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + set_page_huge_active(page); + + ptl = huge_pte_lockptr(h, dst_mm, dst_pte); + spin_lock(ptl); + + ret = -EEXIST; + if (!huge_pte_none(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + + _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); + put_page(page); + goto out; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, -- cgit v1.2.3 From 66a1bcd5fef93e525003c6b310e71bbded13ffd7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 8 Feb 2017 10:26:13 +1100 Subject: userfaultfd: hugetlbfs: fix __mcopy_atomic_hugetlb retry/error processing The new routine copy_huge_page_from_user() uses kmap_atomic() to map PAGE_SIZE pages. However, this prevents page faults in the subsequent call to copy_from_user(). This is OK in the case where the routine is copied with mmap_sema held. However, in another case we want to allow page faults. So, add a new argument allow_pagefault to indicate if the routine should allow page faults. Link: http://lkml.kernel.org/r/20161216144821.5183-20-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- mm/hugetlb.c | 2 +- mm/memory.c | 13 ++++++++++--- mm/userfaultfd.c | 2 +- 4 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 87de452212e6..4dd00d30977d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2422,7 +2422,8 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); extern long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, - unsigned int pages_per_huge_page); + unsigned int pages_per_huge_page, + bool allow_pagefault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dec628b26f59..5d20af921a30 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3973,7 +3973,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = copy_huge_page_from_user(page, (const void __user *) src_addr, - pages_per_huge_page(h)); + pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { diff --git a/mm/memory.c b/mm/memory.c index 4ade940d105c..6012a05d29f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4155,7 +4155,8 @@ void copy_user_huge_page(struct page *dst, struct page *src, long copy_huge_page_from_user(struct page *dst_page, const void __user *usr_src, - unsigned int pages_per_huge_page) + unsigned int pages_per_huge_page, + bool allow_pagefault) { void *src = (void *)usr_src; void *page_kaddr; @@ -4163,11 +4164,17 @@ long copy_huge_page_from_user(struct page *dst_page, unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; for (i = 0; i < pages_per_huge_page; i++) { - page_kaddr = kmap_atomic(dst_page + i); + if (allow_pagefault) + page_kaddr = kmap(dst_page + i); + else + page_kaddr = kmap_atomic(dst_page + i); rc = copy_from_user(page_kaddr, (const void __user *)(src + i * PAGE_SIZE), PAGE_SIZE); - kunmap_atomic(page_kaddr); + if (allow_pagefault) + kunmap(page_kaddr); + else + kunmap_atomic(page_kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ef0495bfd17a..09976745be23 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -274,7 +274,7 @@ retry: err = copy_huge_page_from_user(page, (const void __user *)src_addr, - pages_per_huge_page(h)); + pages_per_huge_page(h), true); if (unlikely(err)) { err = -EFAULT; goto out; -- cgit v1.2.3 From 963e731d9f015c5928925edc196c3d0bbdbaa973 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 8 Feb 2017 10:26:15 +1100 Subject: userfaultfd: hugetlbfs: allow registration of ranges containing huge pages Expand the userfaultfd_register/unregister routines to allow VM_HUGETLB vmas. huge page alignment checking is performed after a VM_HUGETLB vma is encountered. Also, since there is no UFFDIO_ZEROPAGE support for huge pages do not return that as a valid ioctl method for huge page ranges. Link: http://lkml.kernel.org/r/20161216144821.5183-22-aarcange@redhat.com Signed-off-by: Mike Kravetz Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 55 ++++++++++++++++++++++++++++++++++++---- include/uapi/linux/userfaultfd.h | 3 +++ 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 26e1ef00b63c..5139d05f80e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,6 +27,7 @@ #include #include #include +#include static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -1058,6 +1059,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; + bool huge_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1108,6 +1110,17 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1116,6 +1129,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * on anonymous vmas). */ found = false; + huge_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1124,8 +1138,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; + /* + * If this vma contains ending address, and huge pages + * check alignment. + */ + if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && + end > cur->vm_start) { + unsigned long vma_hpagesize = vma_kernel_pagesize(cur); + + ret = -EINVAL; + + if (end & (vma_hpagesize - 1)) + goto out_unlock; + } /* * Check that this vma isn't already owned by a @@ -1138,6 +1165,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cur->vm_userfaultfd_ctx.ctx != ctx) goto out_unlock; + /* + * Note vmas containing huge pages + */ + if (is_vm_hugetlb_page(cur)) + huge_pages = true; + found = true; } BUG_ON(!found); @@ -1149,7 +1182,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); @@ -1207,7 +1240,8 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(UFFD_API_RANGE_IOCTLS, + if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; } @@ -1253,6 +1287,17 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (vma->vm_start >= end) goto out_unlock; + /* + * If the first vma contains huge pages, make sure start address + * is aligned to huge page size. + */ + if (is_vm_hugetlb_page(vma)) { + unsigned long vma_hpagesize = vma_kernel_pagesize(vma); + + if (start & (vma_hpagesize - 1)) + goto out_unlock; + } + /* * Search for not compatible vmas. * @@ -1275,7 +1320,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. 
*/ - if (!vma_is_anonymous(cur)) + if (!vma_is_anonymous(cur) && !is_vm_hugetlb_page(cur)) goto out_unlock; found = true; @@ -1289,7 +1334,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_is_anonymous(vma)); + BUG_ON(!vma_is_anonymous(vma) && !is_vm_hugetlb_page(vma)); /* * Nothing to do: this vma is already registered into this diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2bbf32319cf5..a3828a9bc16e 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -29,6 +29,9 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_HPAGE \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) /* * Valid ioctl command number range with this API is from 0x00 to -- cgit v1.2.3 From ad84957837be561c58cb32b9e872f3480a3f6f18 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 8 Feb 2017 10:26:16 +1100 Subject: userfaultfd: hugetlbfs: gup: support VM_FAULT_RETRY Add support for VM_FAULT_RETRY to follow_hugetlb_page() so that get_user_pages_unlocked/locked and "nonblocking/FOLL_NOWAIT" features will work on hugetlbfs. This is required for fully functional userfaultfd non-present support on hugetlbfs. Link: http://lkml.kernel.org/r/20161216144821.5183-25-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Mike Kravetz Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++-- mm/gup.c | 2 +- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aab2fff3e269..503099d8aada 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, - unsigned long *, unsigned long *, long, unsigned int); + unsigned long *, unsigned long *, long, unsigned int, + int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); void __unmap_hugepage_range_final(struct mmu_gather *tlb, @@ -136,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -#define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; }) +#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) static inline void hugetlb_report_meminfo(struct seq_file *m) diff --git a/mm/gup.c b/mm/gup.c index 55315555489d..40abe4c90383 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, - gup_flags); + gup_flags, nonblocking); continue; } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4b29054cc3f..f6c7ff316daf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4065,7 +4065,7 @@ out_release_unlock: long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long 
*position, unsigned long *nr_pages, - long i, unsigned int flags) + long i, unsigned int flags, int *nonblocking) { unsigned long pfn_offset; unsigned long vaddr = *position; @@ -4128,16 +4128,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { int ret; + unsigned int fault_flags = 0; if (pte) spin_unlock(ptl); - ret = hugetlb_fault(mm, vma, vaddr, - (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - if (!(ret & VM_FAULT_ERROR)) - continue; - - remainder = 0; - break; + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & + FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. + */ + return i; + } + continue; } pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; @@ -4166,6 +4193,11 @@ same_page: spin_unlock(ptl); } *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ *position = vaddr; return i ? i : -EFAULT; -- cgit v1.2.3 From 23bfcc92362fe17503d633465df7d6a40b1a145d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 8 Feb 2017 10:26:17 +1100 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_HUGETLBFS Userland developers asked to be notified immediately by the UFFDIO_API ioctl if hugetlbfs missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a hugetlbfs virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-27-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/uapi/linux/userfaultfd.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a3828a9bc16e..7293321abdfb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,9 +18,10 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ - UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_MISSING_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -125,11 +126,32 @@ struct uffdio_api { * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE * are to be considered implicitly always enabled in all kernels as * long as the uffdio_api.api requested matches UFFD_API. 
+ * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 498252be305067a7077f4b7a86ee8908687286a5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:26:18 +1100 Subject: userfaultfd: shmem: add shmem_mcopy_atomic_pte for userfaultfd support shmem_mcopy_atomic_pte is the low level routine that implements the userfaultfd UFFDIO_COPY command. It is based on the existing mcopy_atomic_pte routine with modifications for shared memory pages. Link: http://lkml.kernel.org/r/20161216144821.5183-29-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 11 +++++ mm/shmem.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff078e7043b6..fdaac9d4d46d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -124,4 +124,15 @@ static inline bool shmem_huge_enabled(struct vm_area_struct *vma) } #endif +#ifdef CONFIG_SHMEM +extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep); +#else +#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) +#endif + #endif diff --git a/mm/shmem.c b/mm/shmem.c index 7d52cd4b504d..14de2a9e5083 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -70,6 +70,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include @@ -2178,6 +2179,115 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct mem_cgroup *memcg; + spinlock_t *ptl; + void *page_kaddr; + struct page *page; + pte_t 
_dst_pte, *dst_pte; + int ret; + + if (!*pagep) { + ret = -ENOMEM; + if (shmem_acct_block(info->flags, 1)) + goto out; + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) + goto out_unacct_blocks; + percpu_counter_inc(&sbinfo->used_blocks); + } + + page = shmem_alloc_page(gfp, info, pgoff); + if (!page) + goto out_dec_used_blocks; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + /* don't free the page */ + return -EFAULT; + } + } else { + page = *pagep; + *pagep = NULL; + } + + ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false); + if (ret) + goto out_release; + + ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!ret) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); + radix_tree_preload_end(); + } + if (ret) + goto out_release_uncharge; + + mem_cgroup_commit_charge(page, memcg, false, false); + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + __SetPageUptodate(page); + + lru_cache_add_anon(page); + + spin_lock(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + inc_mm_counter(dst_mm, mm_counter_file(page)); + page_add_file_rmap(page, false); + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + unlock_page(page); + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); +out_release_uncharge: + mem_cgroup_cancel_charge(page, memcg, false); +out_release: + put_page(page); +out_dec_used_blocks: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +out_unacct_blocks: + shmem_unacct_blocks(info->flags, 1); + goto out; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -- cgit v1.2.3 From f615b4f9564d77fa1dd52405697c4a04d5fc773b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:26:18 +1100 Subject: userfaultfd: shmem: introduce vma_is_shmem Currently userfault relies on vma_is_anonymous and vma_is_hugetlb to ensure compatibility of a VMA with userfault. Introduction of vma_is_shmem allows detection if tmpfs backed VMAs, so that they may be used with userfaultfd. Current implementation presumes usage of vma_is_shmem only by slow path routines in userfaultfd, therefore the vma_is_shmem is not made inline to leave the few remaining free bits in vm_flags. Link: http://lkml.kernel.org/r/20161216144821.5183-30-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ mm/shmem.c | 5 +++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4dd00d30977d..cfb51e7c15a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1379,6 +1379,16 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +#ifdef CONFIG_SHMEM +/* + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. + */ +bool vma_is_shmem(struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +#endif + static inline int stack_guard_page_start(struct vm_area_struct *vma, unsigned long addr) { diff --git a/mm/shmem.c b/mm/shmem.c index 14de2a9e5083..8412baec17cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -191,6 +191,11 @@ static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_vm_ops; +} + static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -- cgit v1.2.3 From 5ee35432cc202405f024d7d5c7032366f6a88eb4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:26:20 +1100 Subject: userfaultfd: shmem: allow registration of shared memory ranges Expand the userfaultfd_register/unregister routines to allow shared memory VMAs. Currently, there is no UFFDIO_ZEROPAGE and write-protection support for shared memory VMAs, which is reflected in ioctl methods supported by uffdio_register. Link: http://lkml.kernel.org/r/20161216144821.5183-34-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 21 +++++++-------------- include/uapi/linux/userfaultfd.h | 2 +- tools/testing/selftests/vm/userfaultfd.c | 2 +- 3 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 593135c296bc..18406158e13f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1098,7 +1098,8 @@ static __always_inline int validate_range(struct mm_struct *mm, static inline bool vma_can_userfault(struct vm_area_struct *vma) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma); + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1111,7 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; - bool huge_pages; + bool non_anon_pages; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1175,13 +1176,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). 
*/ found = false; - huge_pages = false; + non_anon_pages = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1220,8 +1217,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Note vmas containing huge pages */ - if (is_vm_hugetlb_page(cur)) - huge_pages = true; + if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) + non_anon_pages = true; found = true; } @@ -1292,7 +1289,7 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(huge_pages ? UFFD_API_RANGE_IOCTLS_HPAGE : + if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; @@ -1352,10 +1349,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* * Search for not compatible vmas. - * - * FIXME: this shall be relaxed later so that it doesn't fail - * on tmpfs backed vmas (in addition to the current allowance - * on anonymous vmas). */ found = false; ret = -EINVAL; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7293321abdfb..10631a4cdb24 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -30,7 +30,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE) -#define UFFD_API_RANGE_IOCTLS_HPAGE \ +#define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 3011711212ca..d753a9161411 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -129,7 +129,7 @@ static void allocate_area(void **alloc_area) #else /* HUGETLB_TEST */ -#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_HPAGE +#define EXPECTED_IOCTLS UFFD_API_RANGE_IOCTLS_BASIC static int release_pages(char *rel_area) { -- cgit v1.2.3 From 9ff39ae172f5f8cf632e0bd5f434bb561b7222ad Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 8 Feb 2017 10:26:22 +1100 Subject: userfaultfd: hugetlbfs: UFFD_FEATURE_MISSING_SHMEM Userland developers asked to be notified immediately by the UFFDIO_API ioctl if shmem missing mode is supported by userfaultfd in the running kernel. This avoids the need to run UFFDIO_REGISTER on a shmem virtual memory range to find out. Link: http://lkml.kernel.org/r/20161216144821.5183-38-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Hillf Danton Cc: Michael Rapoport Cc: Mike Kravetz Cc: Mike Rapoport Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- include/uapi/linux/userfaultfd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 10631a4cdb24..9ac4b68c54d1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,7 +21,8 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_MADVDONTNEED | \ - UFFD_FEATURE_MISSING_HUGETLBFS) + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -146,12 +147,17 @@ struct uffdio_api { * it, so userland can later check if the feature flag is * present in uffdio_api.features after UFFDIO_API * succeeded. 
+ * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) #define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 7bcb1d80a1dc10055e2be9471390039d78c5e64a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:25 +1100 Subject: mm, vmscan: remove unused mm_vmscan_memcg_isolate Patch series "vm, vmscan: enhance vmscan tracepoints", v2. While debugging [2] I realized that there is some room for improvement in the tracepoint set we offer currently; I had a hard time drawing any conclusions from the existing ones. The resulting problem turned out to be active list aging [3] and we are missing at least two tracepoints to debug such a problem. Some existing tracepoints could export more information to show _why_ reclaim progress cannot be made, not only _how much_ we could reclaim. The latter can already be seen quite reasonably from the vmstat counters. It can be argued that we are showing too many implementation details in those tracepoints, but I consider them way too low-level already to be usable by any kernel-independent userspace. I would be _really_ surprised if anything but debugging tools has used them. Any feedback is highly appreciated. [1] http://lkml.kernel.org/r/20161228153032.10821-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/20161215225702.GA27944@boerne.fritz.box [3] http://lkml.kernel.org/r/20161223105157.GB23109@dhcp22.suse.cz This patch (of 8): The tracepoint has been unused since 925b7673cce3 ("mm: make per-memcg LRU lists exclusive"), so it can be removed. 
Link: http://lkml.kernel.org/r/20170104101942.4860-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c88fd0934e7e..39bad8921ca1 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -269,8 +269,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->retval) ); -DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, - +TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int classzone_idx, int order, unsigned long nr_requested, @@ -311,34 +310,6 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __entry->file) ); -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - -); - -DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, - - TP_PROTO(int classzone_idx, - int order, - unsigned long nr_requested, - unsigned long nr_scanned, - unsigned long nr_taken, - isolate_mode_t isolate_mode, - int file), - - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file) - -); - TRACE_EVENT(mm_vmscan_writepage, TP_PROTO(struct page *page), -- cgit v1.2.3 From 5dbdf795fb5d91483736da678a1c9c3b1e58dc97 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:25 +1100 Subject: mm, vmscan: add active list aging tracepoint Our reclaim process has several tracepoints to tell us more about how things are progressing. We are, however, missing a tracepoint to track active list aging. Introduce mm_vmscan_lru_shrink_active which reports the number of - nr_scanned, nr_taken pages to tell us the LRU isolation effectiveness. - nr_referenced pages which tells us that we are hitting referenced pages which are deactivated. If this is a large part of the reported nr_deactivated pages then we might be hitting into the active list too early because they might be still part of the working set. This might help to debug performance issues. - nr_activated pages which tells us how many pages are kept on the active list - mostly exec file backed pages. A high number can indicate that we might be trashing on executables. 
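For illustration, the TP_printk format added below renders a single event roughly as follows (the values are hypothetical and the flags string is whatever show_reclaim_flags() decodes for the given reclaim context):

  mm_vmscan_lru_shrink_active: nid=0 nr_taken=32 nr_activated=5 nr_deactivated=27 nr_referenced=12 priority=12 flags=RECLAIM_WB_FILE|RECLAIM_WB_ASYNC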
Link: http://lkml.kernel.org/r/20170104101942.4860-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 36 ++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 18 ++++++++++++++---- 2 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 39bad8921ca1..087c0b625ba7 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -363,6 +363,42 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_activate, unsigned long nr_deactivated, + unsigned long nr_referenced, int priority, int file), + + TP_ARGS(nid, nr_taken, nr_activate, nr_deactivated, nr_referenced, priority, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_activate) + __field(unsigned long, nr_deactivated) + __field(unsigned long, nr_referenced) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_activate = nr_activate; + __entry->nr_deactivated = nr_deactivated; + __entry->nr_referenced = nr_referenced; + __entry->priority = priority; + __entry->reclaim_flags = trace_shrink_flags(file); + ), + + TP_printk("nid=%d nr_taken=%ld nr_activated=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", + __entry->nid, + __entry->nr_taken, + __entry->nr_activate, __entry->nr_deactivated, __entry->nr_referenced, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 532a2a750952..a34bf51d68e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1855,9 +1855,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lru. 
*/ -static void move_active_pages_to_lru(struct lruvec *lruvec, +static unsigned move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, struct list_head *pages_to_free, enum lru_list lru) @@ -1866,6 +1868,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, unsigned long pgmoved = 0; struct page *page; int nr_pages; + int nr_moved = 0; while (!list_empty(list)) { page = lru_to_page(list); @@ -1891,11 +1894,15 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); + } else { + nr_moved += nr_pages; } } if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); + + return nr_moved; } static void shrink_active_list(unsigned long nr_to_scan, @@ -1911,7 +1918,8 @@ static void shrink_active_list(unsigned long nr_to_scan, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - unsigned long nr_rotated = 0; + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -1989,13 +1997,15 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); } /* -- cgit v1.2.3 From f8dcf9e89d9f43fd86aea88863d265bd4dcd08b0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:25 +1100 Subject: mm-vmscan-add-active-list-aging-tracepoint-update Our reclaim process has several tracepoints to tell us more about how things are progressing. We are, however, missing a tracepoint to track active list aging. Introduce mm_vmscan_lru_shrink_active, which reports:
- nr_taken, the number of pages isolated from the active list.
- nr_referenced pages, which tells us that we are hitting referenced pages which are being deactivated. If this is a large part of the reported nr_deactivated pages then we might be hitting the active list too early because those pages might still be part of the working set. This might help to debug performance issues.
- nr_active pages, which tells us how many pages are kept on the active list - mostly exec file backed pages. A high number can indicate that we might be thrashing on executables.
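[Purely illustrative, not part of this patch: with the rename below applied, a single emitted event would look roughly like the following line. All values are invented; only the field layout follows the TP_printk format in this update, and the flag names come from show_reclaim_flags().]

mm_vmscan_lru_shrink_active: nid=0 nr_taken=27 nr_active=4 nr_deactivated=23 nr_referenced=9 priority=10 flags=RECLAIM_WB_FILE|RECLAIM_WB_ASYNC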
Link: http://lkml.kernel.org/r/20170104135244.GJ25453@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 087c0b625ba7..c295d8f1b67a 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -366,15 +366,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TRACE_EVENT(mm_vmscan_lru_shrink_active, TP_PROTO(int nid, unsigned long nr_taken, - unsigned long nr_activate, unsigned long nr_deactivated, + unsigned long nr_active, unsigned long nr_deactivated, unsigned long nr_referenced, int priority, int file), - TP_ARGS(nid, nr_taken, nr_activate, nr_deactivated, nr_referenced, priority, file), + TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_taken) - __field(unsigned long, nr_activate) + __field(unsigned long, nr_active) __field(unsigned long, nr_deactivated) __field(unsigned long, nr_referenced) __field(int, priority) @@ -384,17 +384,17 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active, TP_fast_assign( __entry->nid = nid; __entry->nr_taken = nr_taken; - __entry->nr_activate = nr_activate; + __entry->nr_active = nr_active; __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_taken=%ld nr_activated=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", __entry->nid, __entry->nr_taken, - __entry->nr_activate, __entry->nr_deactivated, __entry->nr_referenced, + __entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); -- cgit v1.2.3 From ebeb58d6b5e285bf2356ede643f10f81542d3ace Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:26 +1100 Subject: mm, vmscan: show the number of skipped pages in mm_vmscan_lru_isolate mm_vmscan_lru_isolate shows the number of requested, scanned and taken pages. This is mostly OK but on 32-bit systems the number of scanned pages is quite misleading because it includes both the scanned and skipped pages. Moreover the skipped part is scaled based on the number of taken pages. Let's report the exact numbers without any additional logic and add the number of skipped pages. This should make the reported data much easier to interpret.
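[Purely illustrative, not part of this patch: after this change a single mm_vmscan_lru_isolate event reports the skipped pages separately instead of folding them into nr_scanned. With invented values and the field order of the TP_printk below, a line could look like: isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=19 nr_skipped=13 nr_taken=19 file=1]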
Link: http://lkml.kernel.org/r/20170104101942.4860-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 8 ++++++-- mm/vmscan.c | 13 +++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c295d8f1b67a..340b3c4ef4a8 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -274,17 +274,19 @@ TRACE_EVENT(mm_vmscan_lru_isolate, int order, unsigned long nr_requested, unsigned long nr_scanned, + unsigned long nr_skipped, unsigned long nr_taken, isolate_mode_t isolate_mode, int file), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, file), TP_STRUCT__entry( __field(int, classzone_idx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) + __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(isolate_mode_t, isolate_mode) __field(int, file) @@ -295,17 +297,19 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; + __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->isolate_mode = isolate_mode; __entry->file = file; ), - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu file=%d", __entry->isolate_mode, __entry->classzone_idx, __entry->order, __entry->nr_requested, __entry->nr_scanned, + __entry->nr_skipped, __entry->nr_taken, __entry->file) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index a34bf51d68e2..d2fc97c6f72c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1437,6 +1437,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0, total_skipped = 0; unsigned long scan, nr_pages; LIST_HEAD(pages_skipped); @@ -1488,14 +1489,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, */ if (!list_empty(&pages_skipped)) { int zid; - unsigned long total_skipped = 0; for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); - total_skipped += nr_skipped[zid]; + skipped += nr_skipped[zid]; } /* @@ -1503,13 +1503,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * close to unreclaimable. If the LRU list is empty, account * skipped pages as a full scan. */ - scan += list_empty(src) ? total_skipped : total_skipped >> 2; + total_skipped = list_empty(src) ? 
skipped : skipped >> 2; list_splice(&pages_skipped, src); } - *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, - nr_taken, mode, is_file_lru(lru)); + *nr_scanned = scan + total_skipped; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + scan, skipped, nr_taken, mode, + is_file_lru(lru)); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit v1.2.3 From 03b148355cb5fd4baf0ec4ef8d1883cf90dedaa5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:26 +1100 Subject: mm, vmscan: show LRU name in mm_vmscan_lru_isolate tracepoint mm_vmscan_lru_isolate currently prints only whether the LRU we isolate from is file or anonymous but we do not know which LRU this is. It is useful to know whether the list is active or inactive, since we are using the same function to isolate pages from both of them and it's hard to distinguish otherwise. Link: http://lkml.kernel.org/r/20170104101942.4860-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/mmflags.h | 8 ++++++++ include/trace/events/vmscan.h | 12 ++++++------ mm/vmscan.c | 3 +-- 3 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 17c45bcf8f92..12cd88c86b66 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -239,6 +239,13 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ IFDEF_ZONE_HIGHMEM( EM (ZONE_HIGHMEM,"HighMem")) \ EMe(ZONE_MOVABLE,"Movable") +#define LRU_NAMES \ + EM (LRU_INACTIVE_ANON, "inactive_anon") \ + EM (LRU_ACTIVE_ANON, "active_anon") \ + EM (LRU_INACTIVE_FILE, "inactive_file") \ + EM (LRU_ACTIVE_FILE, "active_file") \ + EMe(LRU_UNEVICTABLE, "unevictable") + /* * First define the enums in the above macros to be exported to userspace * via TRACE_DEFINE_ENUM(). 
@@ -252,6 +259,7 @@ COMPACTION_STATUS COMPACTION_PRIORITY COMPACTION_FEEDBACK ZONE_TYPE +LRU_NAMES /* * Now redefine the EM() and EMe() macros to map the enums to the strings diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 340b3c4ef4a8..4af0bf70e07e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -277,9 +277,9 @@ TRACE_EVENT(mm_vmscan_lru_isolate, unsigned long nr_skipped, unsigned long nr_taken, isolate_mode_t isolate_mode, - int file), + int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, file), + TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( __field(int, classzone_idx) @@ -289,7 +289,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(isolate_mode_t, isolate_mode) - __field(int, file) + __field(int, lru) ), TP_fast_assign( @@ -300,10 +300,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->isolate_mode = isolate_mode; - __entry->file = file; + __entry->lru = lru; ), - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu file=%d", + TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, __entry->classzone_idx, __entry->order, @@ -311,7 +311,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_scanned, __entry->nr_skipped, __entry->nr_taken, - __entry->file) + __print_symbolic(__entry->lru, LRU_NAMES)) ); TRACE_EVENT(mm_vmscan_writepage, diff --git a/mm/vmscan.c b/mm/vmscan.c index d2fc97c6f72c..daacfc1be867 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1509,8 +1509,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } *nr_scanned = scan + total_skipped; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - scan, skipped, nr_taken, mode, - is_file_lru(lru)); + scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit v1.2.3 From 17414d2081047cbb2098a67cd9eb63ad934ebd97 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:27 +1100 Subject: mm, vmscan: enhance mm_vmscan_lru_shrink_inactive tracepoint mm_vmscan_lru_shrink_inactive will currently report the number of scanned and reclaimed pages. This doesn't give us an idea how the reclaim went except for the overall effectiveness though. Export and show other counters which will tell us why we couldn't reclaim some pages. - nr_dirty, nr_writeback, nr_congested and nr_immediate tells us how many pages are blocked due to IO - nr_activate tells us how many pages were moved to the active list - nr_ref_keep reports how many pages are kept on the LRU due to references (mostly for the file pages which are about to go for another round through the inactive list) - nr_unmap_fail - how many pages failed to unmap All these are rather low level so they might change in future but the tracepoint is already implementation specific so no tools should be depending on its stability. 
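[Purely illustrative, not part of this patch: once the extra integer fields exist, the standard trace event filter file can be used to log only the problematic cases, for example pages that failed to unmap or hit congestion. A minimal sketch; the tracefs path and the chosen filter expression are assumptions of the example, not something this series adds.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Assumes debugfs at /sys/kernel/debug; newer kernels also expose /sys/kernel/tracing. */
        const char *filter =
                "/sys/kernel/debug/tracing/events/vmscan/mm_vmscan_lru_shrink_inactive/filter";
        const char *expr = "nr_unmap_fail > 0 || nr_congested > 0";
        int fd = open(filter, O_WRONLY);

        if (fd < 0 || write(fd, expr, strlen(expr)) < 0) {
                perror("filter");
                return 1;
        }
        close(fd);
        return 0;
}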
Link: http://lkml.kernel.org/r/20170104101942.4860-7-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 29 ++++++++++++++++++++++++++--- mm/vmscan.c | 14 ++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 4af0bf70e07e..08c1cd5af0d6 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -340,14 +340,27 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, unsigned long nr_scanned, unsigned long nr_reclaimed, + unsigned long nr_dirty, unsigned long nr_writeback, + unsigned long nr_congested, unsigned long nr_immediate, + unsigned long nr_activate, unsigned long nr_ref_keep, + unsigned long nr_unmap_fail, int priority, int file), - TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file), + TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback, + nr_congested, nr_immediate, nr_activate, nr_ref_keep, + nr_unmap_fail, priority, file), TP_STRUCT__entry( __field(int, nid) __field(unsigned long, nr_scanned) __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_congested) + __field(unsigned long, nr_immediate) + __field(unsigned long, nr_activate) + __field(unsigned long, nr_ref_keep) + __field(unsigned long, nr_unmap_fail) __field(int, priority) __field(int, reclaim_flags) ), @@ -356,14 +369,24 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nid = nid; __entry->nr_scanned = nr_scanned; __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_dirty = nr_dirty; + __entry->nr_writeback = nr_writeback; + __entry->nr_congested = nr_congested; + __entry->nr_immediate = nr_immediate; + __entry->nr_activate = nr_activate; + __entry->nr_ref_keep = nr_ref_keep; + __entry->nr_unmap_fail = nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, - __entry->priority, + __entry->nr_dirty, __entry->nr_writeback, + __entry->nr_congested, __entry->nr_immediate, + __entry->nr_activate, __entry->nr_ref_keep, + __entry->nr_unmap_fail, __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3e5f33b78daf..8cc90bd8149d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -918,6 +918,9 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; + unsigned nr_activate; + unsigned nr_ref_keep; + unsigned nr_unmap_fail; }; /* @@ -939,6 +942,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned nr_reclaimed = 0; unsigned nr_writeback = 0; unsigned nr_immediate = 0; + unsigned nr_ref_keep = 0; + unsigned nr_unmap_fail = 0; cond_resched(); @@ -1077,6 +1082,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, case PAGEREF_ACTIVATE: goto activate_locked; case PAGEREF_KEEP: + nr_ref_keep++; goto keep_locked; case PAGEREF_RECLAIM: case PAGEREF_RECLAIM_CLEAN: @@ -1114,6 +1120,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (ttu_flags | TTU_BATCH_FLUSH | 
TTU_LZFREE) : (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: + nr_unmap_fail++; goto activate_locked; case SWAP_AGAIN: goto keep_locked; @@ -1286,6 +1293,9 @@ keep: stat->nr_unqueued_dirty = nr_unqueued_dirty; stat->nr_writeback = nr_writeback; stat->nr_immediate = nr_immediate; + stat->nr_activate = pgactivate; + stat->nr_ref_keep = nr_ref_keep; + stat->nr_unmap_fail = nr_unmap_fail; } return nr_reclaimed; } @@ -1834,6 +1844,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, nr_scanned, nr_reclaimed, + stat.nr_dirty, stat.nr_writeback, + stat.nr_congested, stat.nr_immediate, + stat.nr_activate, stat.nr_ref_keep, + stat.nr_unmap_fail, sc->priority, file); return nr_reclaimed; } -- cgit v1.2.3 From 5283317920496196b13b253385f58e7bf82098f3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:27 +1100 Subject: mm, vmscan: add mm_vmscan_inactive_list_is_low tracepoint Currently we have tracepoints for reclaim of both the active and inactive LRU lists, but we do not have any which would tell us why we decided to age the active list. Without that it is quite hard to diagnose active/inactive list balancing. Add the mm_vmscan_inactive_list_is_low tracepoint to provide this information. Link: http://lkml.kernel.org/r/20170104101942.4860-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 40 ++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 23 ++++++++++++++--------- 2 files changed, 54 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 08c1cd5af0d6..27e8a5c77579 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -15,6 +15,7 @@ #define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u +#define RECLAIM_WB_LRU (RECLAIM_WB_ANON|RECLAIM_WB_FILE) #define show_reclaim_flags(flags) \ (flags) ?
__print_flags(flags, "|", \ @@ -426,6 +427,45 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_inactive_list_is_low, + + TP_PROTO(int nid, int reclaim_idx, + unsigned long total_inactive, unsigned long inactive, + unsigned long total_active, unsigned long active, + unsigned long ratio, int file), + + TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, reclaim_idx) + __field(unsigned long, total_inactive) + __field(unsigned long, inactive) + __field(unsigned long, total_active) + __field(unsigned long, active) + __field(unsigned long, ratio) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->reclaim_idx = reclaim_idx; + __entry->total_inactive = total_inactive; + __entry->inactive = inactive; + __entry->total_active = total_active; + __entry->active = active; + __entry->ratio = ratio; + __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; + ), + + TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", + __entry->nid, + __entry->reclaim_idx, + __entry->total_inactive, __entry->inactive, + __entry->total_active, __entry->active, + __entry->ratio, + show_reclaim_flags(__entry->reclaim_flags)) +); #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 8cc90bd8149d..de400d1eac0e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2048,11 +2048,11 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc) + struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; + unsigned long total_inactive, inactive; + unsigned long total_active, active; unsigned long gb; struct pglist_data *pgdat = lruvec_pgdat(lruvec); int zid; @@ -2064,8 +2064,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE); + total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); /* * For zone-constrained allocations, it is necessary to check if @@ -2092,6 +2092,11 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, else inactive_ratio = 1; + if (trace) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, + sc->reclaim_idx, + total_inactive, inactive, + total_active, active, inactive_ratio, file); return inactive * inactive_ratio < active; } @@ -2099,7 +2104,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2230,7 +2235,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. 
*/ - if (!inactive_list_is_low(lruvec, true, sc) && + if (!inactive_list_is_low(lruvec, true, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2455,7 +2460,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -3105,7 +3110,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); -- cgit v1.2.3 From 5b74b3fba90fc408b3b193b57a5709e861f3b159 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 8 Feb 2017 10:26:29 +1100 Subject: mm: un-export wake_up_page functions These are no longer used outside mm/filemap.c, so un-export them and make them static where possible. These were exported specifically for NFS use in commit a4796e37c12e ("MM: export page_wakeup functions"). Link: http://lkml.kernel.org/r/20170103182234.30141-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Cc: Trond Myklebust Cc: Anna Schumaker Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 12 ++---------- mm/filemap.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 324c8dbad1e1..b572f5530392 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -482,19 +482,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback, - * and for filesystems which need to wait on PG_private. + * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., + * and should not be used directly. */ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); -extern void wake_up_page_bit(struct page *page, int bit_nr); - -static inline void wake_up_page(struct page *page, int bit) -{ - if (!PageWaiters(page)) - return; - wake_up_page_bit(page, bit); -} /* * Wait for a page to be unlocked. 
diff --git a/mm/filemap.c b/mm/filemap.c index 3f9afded581b..a3dbe9eff213 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void return autoremove_wake_function(wait, mode, sync, key); } -void wake_up_page_bit(struct page *page, int bit_nr) +static void wake_up_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; @@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr) } spin_unlock_irqrestore(&q->lock, flags); } -EXPORT_SYMBOL(wake_up_page_bit); + +static void wake_up_page(struct page *page, int bit) +{ + if (!PageWaiters(page)) + return; + wake_up_page_bit(page, bit); +} static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) -- cgit v1.2.3 From 27bfa245acb735b9123262bdd0094081267c8f08 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 8 Feb 2017 10:26:30 +1100 Subject: mm, compaction: add vmstats for kcompactd work A "compact_daemon_wake" vmstat exists that represents the number of times kcompactd has woken up. This doesn't represent how much work it actually did, though. It's useful to understand how much compaction work is being done by kcompactd versus other methods such as direct compaction and explicitly triggered per-node (or system) compaction. This adds two new vmstats: "compact_daemon_migrate_scanned" and "compact_daemon_free_scanned" to represent the number of pages kcompactd has scanned as part of its migration scanner and freeing scanner, respectively. These values are still accounted for in the general "compact_migrate_scanned" and "compact_free_scanned" for compatibility. It could be argued that explicitly triggered compaction could also be tracked separately, and that could be added if others find it useful. 
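[Purely illustrative, not part of this patch: the two new counters end up as plain lines in /proc/vmstat, so they can be sampled with something as small as the sketch below. The counter names are the ones added to vmstat_text in this patch; everything else is an assumption of the example.]

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f) {
                perror("/proc/vmstat");
                return 1;
        }
        /* Print only the kcompactd scanner counters introduced by this patch. */
        while (fgets(line, sizeof(line), f)) {
                if (!strncmp(line, "compact_daemon_migrate_scanned", 30) ||
                    !strncmp(line, "compact_daemon_free_scanned", 27))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}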
Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1612071749390.69852@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 1 + mm/compaction.c | 22 +++++++++++++++++++--- mm/internal.h | 2 ++ mm/vmstat.c | 2 ++ 4 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 4d6ec58a8d45..6aa1b6cb5828 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -56,6 +56,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, KCOMPACTD_WAKE, + KCOMPACTD_MIGRATE_SCANNED, KCOMPACTD_FREE_SCANNED, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/mm/compaction.c b/mm/compaction.c index 949198d01260..c6178bbd3e04 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -548,7 +548,7 @@ isolate_fail: if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); - count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + cc->total_free_scanned += nr_scanned; if (total_isolated) count_compact_events(COMPACTISOLATED, total_isolated); return total_isolated; @@ -931,7 +931,7 @@ isolate_fail: trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, nr_scanned, nr_isolated); - count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + cc->total_migrate_scanned += nr_scanned; if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); @@ -1631,6 +1631,9 @@ out: zone->compact_cached_free_pfn = free_pfn; } + count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); + count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); @@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .order = order, .gfp_mask = gfp_mask, .zone = zone, @@ -1757,6 +1762,8 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -1883,6 +1890,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, + .total_migrate_scanned = 0, + .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, @@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) }; trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.classzone_idx); - count_vm_event(KCOMPACTD_WAKE); + count_compact_event(KCOMPACTD_WAKE); for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; @@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.nr_freepages = 0; cc.nr_migratepages = 0; + cc.total_migrate_scanned = 0; + cc.total_free_scanned = 0; cc.zone = zone; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat) defer_compaction(zone, cc.order); } + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + 
cc.total_free_scanned); + VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); } diff --git a/mm/internal.h b/mm/internal.h index bfad3b5d2665..9ad04fc6eefe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,6 +175,8 @@ struct compact_control { struct list_head migratepages; /* List of pages being migrated */ unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 7c28df36f50f..69f9aff39a2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1038,6 +1038,8 @@ const char * const vmstat_text[] = { "compact_fail", "compact_success", "compact_daemon_wake", + "compact_daemon_migrate_scanned", + "compact_daemon_free_scanned", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 0515dc0b1b881fbfa9f8ef7539b27a2a394645a5 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 8 Feb 2017 10:26:30 +1100 Subject: mm: page_alloc: skip over regions of invalid pfns where possible When using a sparse memory model memmap_init_zone() when invoked with the MEMMAP_EARLY context will skip over pages which aren't valid - ie. which aren't in a populated region of the sparse memory map. However if the memory map is extremely sparse then it can spend a long time linearly checking each PFN in a large non-populated region of the memory map & skipping it in turn. When CONFIG_HAVE_MEMBLOCK_NODE_MAP is enabled, we have sufficient information to quickly discover the next valid PFN given an invalid one by searching through the list of memory regions & skipping forwards to the first PFN covered by the memory region to the right of the non-populated region. Implement this in order to speed up memmap_init_zone() for systems with extremely sparse memory maps. James said "I have tested this patch on a virtual model of a Samurai CPU with a sparse memory map. The kernel boot time drops from 109 to 62 seconds. 
" Link: http://lkml.kernel.org/r/20161125185518.29885-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Tested-by: James Hartley Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 + mm/memblock.c | 25 +++++++++++++++++++++++++ mm/page_alloc.c | 11 ++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5b759c9acf97..38bcf00cbed3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); +unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index 7608bc305936..a476d28e0733 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } +unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, + unsigned long max_pfn) +{ + struct memblock_type *type = &memblock.memory; + unsigned int right = type->cnt; + unsigned int mid, left = 0; + phys_addr_t addr = PFN_PHYS(pfn + 1); + + do { + mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else { + /* addr is within the region, so pfn + 1 is valid */ + return min(pfn + 1, max_pfn); + } + } while (left < right); + + return min(PHYS_PFN(type->regions[right].base), max_pfn); +} + /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05c0a59323bd..6da3169d3750 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5103,8 +5103,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3 From bba816f32adf9fba278621dc25b1f71744a5176c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 8 Feb 2017 10:26:34 +1100 Subject: powerpc: do not make the entire heap executable On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 000014 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 000084 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 0000a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x10000 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. 
Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. Test program showing the difference in /proc/$PID/maps: int main() { char buf[16*1024]; char *p = malloc(123); /* make "[heap]" mapping appear */ int fd = open("/proc/self/maps", O_RDONLY); int len = read(fd, buf, sizeof(buf)); write(1, buf, len); printf("%p\n", p); return 0; } Compiled using: gcc -mbss-plt -m32 -Os test.c -otest Unpatched ppc64 kernel: 00100000-00120000 r-xp 00000000 00:00 0 [vdso] 0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094 /usr/lib/libc-2.17.so 10000000-10010000 r-xp 00000000 fd:00 100674505 /home/user/test 10010000-10020000 r--p 00000000 fd:00 100674505 /home/user/test 10020000-10030000 rw-p 00010000 fd:00 100674505 /home/user/test 10690000-106c0000 rwxp 00000000 00:00 0 [heap] f7f70000-f7fa0000 r-xp 00000000 fd:00 67898089 /usr/lib/ld-2.17.so f7fa0000-f7fb0000 r--p 00020000 fd:00 67898089 /usr/lib/ld-2.17.so f7fb0000-f7fc0000 rw-p 00030000 fd:00 67898089 /usr/lib/ld-2.17.so ffa90000-ffac0000 rw-p 00000000 00:00 0 [stack] 0x10690008 Patched ppc64 kernel: 00100000-00120000 r-xp 00000000 00:00 0 [vdso] 0fe10000-0ffd0000 r-xp 00000000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffd0000-0ffe0000 r--p 001b0000 fd:00 67898094 /usr/lib/libc-2.17.so 0ffe0000-0fff0000 rw-p 001c0000 fd:00 67898094 /usr/lib/libc-2.17.so 10000000-10010000 r-xp 00000000 fd:00 100674505 /home/user/test 10010000-10020000 r--p 00000000 fd:00 100674505 /home/user/test 10020000-10030000 rw-p 00010000 fd:00 100674505 /home/user/test 10180000-101b0000 rw-p 00000000 00:00 0 [heap] ^^^^ this has changed f7c60000-f7c90000 r-xp 00000000 fd:00 67898089 /usr/lib/ld-2.17.so f7c90000-f7ca0000 r--p 00020000 fd:00 67898089 /usr/lib/ld-2.17.so f7ca0000-f7cb0000 rw-p 00030000 fd:00 67898089 /usr/lib/ld-2.17.so ff860000-ff890000 rw-p 00000000 00:00 0 [stack] 0x10180008 The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. Link: http://lkml.kernel.org/r/20161215131950.23054-1-dvlasenk@redhat.com Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Acked-by: Kees Cook Acked-by: Michael Ellerman Tested-by: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Aneesh Kumar K.V" Cc: Oleg Nesterov Cc: Florian Weimer Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/page.h | 4 +++- fs/binfmt_elf.c | 30 ++++++++++++++++++++++-------- include/linux/mm.h | 1 + mm/mmap.c | 24 +++++++++++++++++++----- 4 files changed, 45 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 47120bf2670c..2a32483c7b6c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. 
This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ + VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 422370293cfd..cacb05644c2b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* + * Map the last of the bss segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section. */ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) + if (k > last_bss) { last_bss = k; + bss_prot = elf_prot; + } } } @@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* * Next, align both the file and mem bss up to the page size, * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk() below. + * last_bss will end after the vm_brk_flags() below. */ elf_bss = ELF_PAGEALIGN(elf_bss); last_bss = ELF_PAGEALIGN(last_bss); /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - error = vm_brk(elf_bss, last_bss - elf_bss); + error = vm_brk_flags(elf_bss, last_bss - elf_bss, + bss_prot & PROT_EXEC ? VM_EXEC : 0); if (error) goto out; } @@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; + int bss_prot = 0; int retval, i; unsigned long elf_entry; unsigned long interp_load_addr = 0; @@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm) before this one. Map anonymous pages, if needed, and clear the area. */ retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias); + elf_brk + load_bias, + bss_prot); if (retval) goto out_free_dentry; nbyte = ELF_PAGEOFFSET(elf_bss); @@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) + if (k > elf_brk) { + bss_prot = elf_prot; elf_brk = k; + } } loc->elf_ex.e_entry += load_bias; @@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * mapping in the interpreter, to make sure it doesn't wind * up getting placed where the bss needs to go. 
*/ - retval = set_brk(elf_bss, elf_brk); + retval = set_brk(elf_bss, elf_brk, bss_prot); if (retval) goto out_free_dentry; if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { diff --git a/include/linux/mm.h b/include/linux/mm.h index cfb51e7c15a6..e9012babd322 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2079,6 +2079,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); +extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index dc4291dcc99b..b729084eea90 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk(unsigned long addr, unsigned long request) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags, len; + unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (offset_in_page(error)) @@ -2889,7 +2892,12 @@ out: return 0; } -int vm_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len) +{ + return do_brk_flags(addr, len, 0); +} + +int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; int ret; @@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len) if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk(addr, len); + ret = do_brk_flags(addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); if (populate && !ret) mm_populate(addr, len); return ret; } +EXPORT_SYMBOL(vm_brk_flags); + +int vm_brk(unsigned long addr, unsigned long len) +{ + return vm_brk_flags(addr, len, 0); +} EXPORT_SYMBOL(vm_brk); /* Release all mmaps. */ -- cgit v1.2.3 From 6cafec402453eabfe600b10782b5b71073fb8311 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 8 Feb 2017 10:26:35 +1100 Subject: mm/swap: add cluster lock This patch is to reduce the lock contention of swap_info_struct->lock via using a more fine grained lock in swap_cluster_info for some swap operations. swap_info_struct->lock is heavily contended if multiple processes reclaim pages simultaneously. Because there is only one lock for each swap device. While in common configuration, there is only one or several swap devices in the system. The lock protects almost all swap related operations. In fact, many swap operations only access one element of swap_info_struct->swap_map array. And there is no dependency between different elements of swap_info_struct->swap_map. 
So a fine grained lock can be used to allow parallel access to the different elements of swap_info_struct->swap_map. In this patch, a spinlock is added to swap_cluster_info to protect the elements of swap_info_struct->swap_map in the swap cluster and the fields of swap_cluster_info. This reduced locking contention for swap_info_struct->swap_map access greatly. Because of the added spinlock, the size of swap_cluster_info increases from 4 bytes to 8 bytes on the 64 bit and 32 bit system. This will use additional 4k RAM for every 1G swap space. Because the size of swap_cluster_info is much smaller than the size of the cache line (8 vs 64 on x86_64 architecture), there may be false cache line sharing between spinlocks in swap_cluster_info. To avoid the false sharing in the first round of the swap cluster allocation, the order of the swap clusters in the free clusters list is changed. So that, the swap_cluster_info sharing the same cache line will be placed as far as possible. After the first round of allocation, the order of the clusters in free clusters list is expected to be random. So the false sharing should be not serious. Compared with a previous implementation using bit_spin_lock, the sequential swap out throughput improved about 3.2%. Test was done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case created 32 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used. Link: http://lkml.kernel.org/r/dbb860bbd825b1aaba18988015e8963f263c3f0d.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 13 ++- mm/swapfile.c | 239 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 194 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7f47b7098b1b..e81608a0302a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -176,11 +176,16 @@ enum { * protected by swap_info_struct.lock. 
*/ struct swap_cluster_info { - unsigned int data:24; - unsigned int flags:8; + unsigned long data; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_COUNT_SHIFT 8 +#define CLUSTER_FLAG_MASK ((1UL << CLUSTER_COUNT_SHIFT) - 1) +#define CLUSTER_COUNT_MASK (~CLUSTER_FLAG_MASK) +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +/* cluster lock, protect cluster_info contents and sis->swap_map */ +#define CLUSTER_FLAG_LOCK_BIT 2 +#define CLUSTER_FLAG_LOCK (1 << CLUSTER_FLAG_LOCK_BIT) /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from diff --git a/mm/swapfile.c b/mm/swapfile.c index 2001ce427a1d..10023b45e272 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -200,61 +200,107 @@ static void discard_swap_cluster(struct swap_info_struct *si, #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, - unsigned int flag) + unsigned int flag) { - info->flags = flag; + info->data = (info->data & (CLUSTER_COUNT_MASK | CLUSTER_FLAG_LOCK)) | + (flag & ~CLUSTER_FLAG_LOCK); } static inline unsigned int cluster_count(struct swap_cluster_info *info) { - return info->data; + return info->data >> CLUSTER_COUNT_SHIFT; } static inline void cluster_set_count(struct swap_cluster_info *info, unsigned int c) { - info->data = c; + info->data = (c << CLUSTER_COUNT_SHIFT) | (info->data & CLUSTER_FLAG_MASK); } static inline void cluster_set_count_flag(struct swap_cluster_info *info, unsigned int c, unsigned int f) { - info->flags = f; - info->data = c; + info->data = (info->data & CLUSTER_FLAG_LOCK) | + (c << CLUSTER_COUNT_SHIFT) | (f & ~CLUSTER_FLAG_LOCK); } static inline unsigned int cluster_next(struct swap_cluster_info *info) { - return info->data; + return cluster_count(info); } static inline void cluster_set_next(struct swap_cluster_info *info, unsigned int n) { - info->data = n; + cluster_set_count(info, n); } static inline void cluster_set_next_flag(struct swap_cluster_info *info, unsigned int n, unsigned int f) { - info->flags = f; - info->data = n; + cluster_set_count_flag(info, n, f); } static inline bool cluster_is_free(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_FREE; + return info->data & CLUSTER_FLAG_FREE; } static inline bool cluster_is_null(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_NEXT_NULL; + return info->data & CLUSTER_FLAG_NEXT_NULL; } static inline void cluster_set_null(struct swap_cluster_info *info) { - info->flags = CLUSTER_FLAG_NEXT_NULL; - info->data = 0; + cluster_set_next_flag(info, 0, CLUSTER_FLAG_NEXT_NULL); +} + +/* Protect swap_cluster_info fields and si->swap_map */ +static inline void __lock_cluster(struct swap_cluster_info *ci) +{ + bit_spin_lock(CLUSTER_FLAG_LOCK_BIT, &ci->data); +} + +static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = si->cluster_info; + if (ci) { + ci += offset / SWAPFILE_CLUSTER; + __lock_cluster(ci); + } + return ci; +} + +static inline void unlock_cluster(struct swap_cluster_info *ci) +{ + if (ci) + bit_spin_unlock(CLUSTER_FLAG_LOCK_BIT, &ci->data); +} + +static inline struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (!ci) + 
spin_lock(&si->lock); + + return ci; +} + +static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + if (ci) + unlock_cluster(ci); + else + spin_unlock(&si->lock); } static inline bool cluster_list_empty(struct swap_cluster_list *list) @@ -281,9 +327,17 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, cluster_set_next_flag(&list->head, idx, 0); cluster_set_next_flag(&list->tail, idx, 0); } else { + struct swap_cluster_info *ci_tail; unsigned int tail = cluster_next(&list->tail); - cluster_set_next(&ci[tail], idx); + /* + * Nested cluster lock, but both cluster locks are + * only acquired when we held swap_info_struct->lock + */ + ci_tail = ci + tail; + __lock_cluster(ci_tail); + cluster_set_next(ci_tail, idx); + unlock_cluster(ci_tail); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -328,7 +382,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, */ static void swap_do_scheduled_discard(struct swap_info_struct *si) { - struct swap_cluster_info *info; + struct swap_cluster_info *info, *ci; unsigned int idx; info = si->cluster_info; @@ -341,10 +395,14 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) SWAPFILE_CLUSTER); spin_lock(&si->lock); - cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + cluster_set_flag(ci, CLUSTER_FLAG_FREE); + unlock_cluster(ci); cluster_list_add_tail(&si->free_clusters, info, idx); + ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); } } @@ -447,8 +505,9 @@ static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; + struct swap_cluster_info *ci; bool found_free; - unsigned long tmp; + unsigned long tmp, max; new_cluster: cluster = this_cpu_ptr(si->percpu_cluster); @@ -476,14 +535,21 @@ new_cluster: * check if there is still free entry in the cluster */ tmp = cluster->next; - while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * - SWAPFILE_CLUSTER) { + max = min_t(unsigned long, si->max, + (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); + if (tmp >= max) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + ci = lock_cluster(si, tmp); + while (tmp < max) { if (!si->swap_map[tmp]) { found_free = true; break; } tmp++; } + unlock_cluster(ci); if (!found_free) { cluster_set_null(&cluster->index); goto new_cluster; @@ -496,6 +562,7 @@ new_cluster: static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { + struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; @@ -572,9 +639,11 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; + ci = lock_cluster(si, offset); /* reuse swap entry of cache-only swap if not busy. 
*/ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; + unlock_cluster(ci); spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); spin_lock(&si->lock); @@ -584,8 +653,10 @@ checks: goto scan; /* check next one */ } - if (si->swap_map[offset]) + if (si->swap_map[offset]) { + unlock_cluster(ci); goto scan; + } if (offset == si->lowest_bit) si->lowest_bit++; @@ -601,6 +672,7 @@ checks: } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -731,7 +803,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -749,7 +821,6 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) goto bad_offset; if (!p->swap_map[offset]) goto bad_free; - spin_lock(&p->lock); return p; bad_free: @@ -767,14 +838,45 @@ out: return NULL; } +static struct swap_info_struct *swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + if (p) + spin_lock(&p->lock); + return p; +} + static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry, unsigned char usage, + bool swap_info_locked) { + struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; + bool lock_swap_info = false; + + if (!swap_info_locked) { + count = p->swap_map[offset]; + if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) { +lock_swap_info: + swap_info_locked = true; + lock_swap_info = true; + spin_lock(&p->lock); + } + } + + ci = lock_cluster(p, offset); count = p->swap_map[offset]; + + if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) { + unlock_cluster(ci); + goto lock_swap_info; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -800,10 +902,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage; + unlock_cluster(ci); + /* free if no reference */ if (!usage) { + VM_BUG_ON(!swap_info_locked); mem_cgroup_uncharge_swap(entry); + ci = lock_cluster(p, offset); dec_cluster_info_page(p, p->cluster_info, offset); + unlock_cluster(ci); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) { @@ -829,6 +936,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } } + if (lock_swap_info) + spin_unlock(&p->lock); + return usage; } @@ -840,11 +950,9 @@ void swap_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, entry, 1); - spin_unlock(&p->lock); - } + p = _swap_info_get(entry); + if (p) + swap_entry_free(p, entry, 1, false); } /* @@ -854,11 +962,9 @@ void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, entry, SWAP_HAS_CACHE); - spin_unlock(&p->lock); - } + p = _swap_info_get(entry); + if (p) + swap_entry_free(p, entry, SWAP_HAS_CACHE, false); } /* @@ -870,13 +976,17 @@ int page_swapcount(struct page *page) { int count = 0; struct swap_info_struct *p; + struct swap_cluster_info *ci; swp_entry_t entry; + unsigned long offset; entry.val = page_private(page); - p = swap_info_get(entry); + p = _swap_info_get(entry); 
if (p) { - count = swap_count(p->swap_map[swp_offset(entry)]); - spin_unlock(&p->lock); + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(p, offset); + count = swap_count(p->swap_map[offset]); + unlock_cluster_or_swap_info(p, ci); } return count; } @@ -889,22 +999,26 @@ int swp_swapcount(swp_entry_t entry) { int count, tmp_count, n; struct swap_info_struct *p; + struct swap_cluster_info *ci; struct page *page; pgoff_t offset; unsigned char *map; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (!p) return 0; - count = swap_count(p->swap_map[swp_offset(entry)]); + offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + + count = swap_count(p->swap_map[offset]); if (!(count & COUNT_CONTINUED)) goto out; count &= ~COUNT_CONTINUED; n = SWAP_MAP_MAX + 1; - offset = swp_offset(entry); page = vmalloc_to_page(p->swap_map + offset); offset &= ~PAGE_MASK; VM_BUG_ON(page_private(page) != SWP_CONTINUED); @@ -919,7 +1033,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); return count; } @@ -1017,7 +1131,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -2298,6 +2412,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } +#define SWAP_CLUSTER_COLS \ + DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) + static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, @@ -2305,11 +2422,12 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, unsigned long maxpages, sector_t *span) { - int i; + unsigned int j, k; unsigned int nr_good_pages; int nr_extents; unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; + unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; + unsigned long i, idx; nr_good_pages = maxpages - 1; /* omit header page */ @@ -2357,15 +2475,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (!cluster_info) return nr_extents; - for (i = 0; i < nr_clusters; i++) { - if (!cluster_count(&cluster_info[idx])) { + + /* Reduce false cache line sharing between cluster_info */ + for (k = 0; k < SWAP_CLUSTER_COLS; k++) { + j = (k + col) % SWAP_CLUSTER_COLS; + for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { + idx = i * SWAP_CLUSTER_COLS + j; + if (idx >= nr_clusters) + continue; + if (cluster_count(&cluster_info[idx])) + continue; cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); cluster_list_add_tail(&p->free_clusters, cluster_info, idx); } - idx++; - if (idx == nr_clusters) - idx = 0; } return nr_extents; } @@ -2627,6 +2750,7 @@ void si_swapinfo(struct sysinfo *val) static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; + struct swap_cluster_info *ci; unsigned long offset, type; unsigned char count; unsigned char has_cache; @@ -2640,10 +2764,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto bad_file; p = swap_info[type]; offset = swp_offset(entry); - - spin_lock(&p->lock); if (unlikely(offset >= p->max)) - goto unlock_out; + goto out; + + ci = 
lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -2686,7 +2810,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) p->swap_map[offset] = count | has_cache; unlock_out: - spin_unlock(&p->lock); + unlock_cluster_or_swap_info(p, ci); out: return err; @@ -2775,6 +2899,7 @@ EXPORT_SYMBOL_GPL(__page_file_index); int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) { struct swap_info_struct *si; + struct swap_cluster_info *ci; struct page *head; struct page *page; struct page *list_page; @@ -2798,6 +2923,9 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } offset = swp_offset(entry); + + ci = lock_cluster(si, offset); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { @@ -2810,6 +2938,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { + unlock_cluster(ci); spin_unlock(&si->lock); return -ENOMEM; } @@ -2858,6 +2987,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) list_add_tail(&page->lru, &head->lru); page = NULL; /* now it's attached, don't free it */ out: + unlock_cluster(ci); spin_unlock(&si->lock); outer: if (page) @@ -2871,7 +3001,8 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster + * lock. */ static bool swap_count_continued(struct swap_info_struct *si, pgoff_t offset, unsigned char count) -- cgit v1.2.3 From 063a2a8889e984aea9f3ac81f5f63289b204f985 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 8 Feb 2017 10:26:35 +1100 Subject: mm-swap-add-cluster-lock-v5 Link: http://lkml.kernel.org/r/878tqeuuic.fsf_-_@yhuang-dev.intel.com Signed-off-by: "Huang, Ying" Cc: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 19 ++++++++++--------- mm/swapfile.c | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index e81608a0302a..279c7b84bd63 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -176,16 +176,17 @@ enum { * protected by swap_info_struct.lock. 
*/ struct swap_cluster_info { - unsigned long data; + spinlock_t lock; /* + * Protect swap_cluster_info fields + * and swap_info_struct->swap_map + * elements correspond to the swap + * cluster + */ + unsigned int data:24; + unsigned int flags:8; }; -#define CLUSTER_COUNT_SHIFT 8 -#define CLUSTER_FLAG_MASK ((1UL << CLUSTER_COUNT_SHIFT) - 1) -#define CLUSTER_COUNT_MASK (~CLUSTER_FLAG_MASK) -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ -/* cluster lock, protect cluster_info contents and sis->swap_map */ -#define CLUSTER_FLAG_LOCK_BIT 2 -#define CLUSTER_FLAG_LOCK (1 << CLUSTER_FLAG_LOCK_BIT) +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from diff --git a/mm/swapfile.c b/mm/swapfile.c index 10023b45e272..d0a020e99df7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -200,66 +200,66 @@ static void discard_swap_cluster(struct swap_info_struct *si, #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, - unsigned int flag) + unsigned int flag) { - info->data = (info->data & (CLUSTER_COUNT_MASK | CLUSTER_FLAG_LOCK)) | - (flag & ~CLUSTER_FLAG_LOCK); + info->flags = flag; } static inline unsigned int cluster_count(struct swap_cluster_info *info) { - return info->data >> CLUSTER_COUNT_SHIFT; + return info->data; } static inline void cluster_set_count(struct swap_cluster_info *info, unsigned int c) { - info->data = (c << CLUSTER_COUNT_SHIFT) | (info->data & CLUSTER_FLAG_MASK); + info->data = c; } static inline void cluster_set_count_flag(struct swap_cluster_info *info, unsigned int c, unsigned int f) { - info->data = (info->data & CLUSTER_FLAG_LOCK) | - (c << CLUSTER_COUNT_SHIFT) | (f & ~CLUSTER_FLAG_LOCK); + info->flags = f; + info->data = c; } static inline unsigned int cluster_next(struct swap_cluster_info *info) { - return cluster_count(info); + return info->data; } static inline void cluster_set_next(struct swap_cluster_info *info, unsigned int n) { - cluster_set_count(info, n); + info->data = n; } static inline void cluster_set_next_flag(struct swap_cluster_info *info, unsigned int n, unsigned int f) { - cluster_set_count_flag(info, n, f); + info->flags = f; + info->data = n; } static inline bool cluster_is_free(struct swap_cluster_info *info) { - return info->data & CLUSTER_FLAG_FREE; + return info->flags & CLUSTER_FLAG_FREE; } static inline bool cluster_is_null(struct swap_cluster_info *info) { - return info->data & CLUSTER_FLAG_NEXT_NULL; + return info->flags & CLUSTER_FLAG_NEXT_NULL; } static inline void cluster_set_null(struct swap_cluster_info *info) { - cluster_set_next_flag(info, 0, CLUSTER_FLAG_NEXT_NULL); + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; } -/* Protect swap_cluster_info fields and si->swap_map */ static inline void __lock_cluster(struct swap_cluster_info *ci) { - bit_spin_lock(CLUSTER_FLAG_LOCK_BIT, &ci->data); + spin_lock(&ci->lock); } static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, @@ -278,7 +278,7 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si static inline void unlock_cluster(struct swap_cluster_info *ci) { if (ci) - bit_spin_unlock(CLUSTER_FLAG_LOCK_BIT, &ci->data); + spin_unlock(&ci->lock); } static inline struct swap_cluster_info *lock_cluster_or_swap_info( -- cgit v1.2.3 From 
c5a83605197a163e8569cc9ac9b1fec066008b13 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Wed, 8 Feb 2017 10:26:36 +1100 Subject: mm/swap: split swap cache into 64MB trunks The patch is to improve the scalability of the swap out/in via using fine grained locks for the swap cache. In current kernel, one address space will be used for each swap device. And in the common configuration, the number of the swap device is very small (one is typical). This causes the heavy lock contention on the radix tree of the address space if multiple tasks swap out/in concurrently. But in fact, there is no dependency between pages in the swap cache. So that, we can split the one shared address space for each swap device into several address spaces to reduce the lock contention. In the patch, the shared address space is split into 64MB trunks. 64MB is chosen to balance the memory space usage and effect of lock contention reduction. The size of struct address_space on x86_64 architecture is 408B, so with the patch, 6528B more memory will be used for every 1GB swap space on x86_64 architecture. One address space is still shared for the swap entries in the same 64M trunks. To avoid lock contention for the first round of swap space allocation, the order of the swap clusters in the initial free clusters list is changed. The swap space distance between the consecutive swap clusters in the free cluster list is at least 64M. After the first round of allocation, the swap clusters are expected to be freed randomly, so the lock contention should be reduced effectively. Link: http://lkml.kernel.org/r/735bab895e64c930581ffb0a05b661e01da82bc5.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 11 +++++++-- mm/swap.c | 6 ----- mm/swap_state.c | 68 ++++++++++++++++++++++++++++++++++++++++++---------- mm/swapfile.c | 16 +++++++++++-- 4 files changed, 79 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 279c7b84bd63..648a32b58e3e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -343,8 +343,13 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, sector_t *); /* linux/mm/swap_state.c */ -extern struct address_space swapper_spaces[]; -#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *, struct list_head *list); @@ -398,6 +403,8 @@ extern struct swap_info_struct *page_swap_info(struct page *); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; +extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); +extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ diff --git a/mm/swap.c b/mm/swap.c index 844baedd2429..aabf2e90fe32 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -971,12 +971,6 @@ EXPORT_SYMBOL(pagevec_lookup_tag); void __init swap_setup(void) { unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); -#ifdef CONFIG_SWAP - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) - spin_lock_init(&swapper_spaces[i].tree_lock); -#endif /* Use a smaller cluster for small-memory machines */ if (megs < 16) diff --git a/mm/swap_state.c b/mm/swap_state.c index 35d7e0ee1c77..3863acd6189c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -32,15 +33,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swapper_spaces[MAX_SWAPFILES] = { - [0 ... MAX_SWAPFILES - 1] = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .i_mmap_writable = ATOMIC_INIT(0), - .a_ops = &swap_aops, - /* swap cache doesn't use writeback related tags */ - .flags = 1 << AS_NO_WRITEBACK_TAGS, - } -}; +struct address_space *swapper_spaces[MAX_SWAPFILES]; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -53,11 +47,26 @@ static struct { unsigned long total_swapcache_pages(void) { - int i; + unsigned int i, j, nr; unsigned long ret = 0; + struct address_space *spaces; - for (i = 0; i < MAX_SWAPFILES; i++) - ret += swapper_spaces[i].nrpages; + rcu_read_lock(); + for (i = 0; i < MAX_SWAPFILES; i++) { + /* + * The corresponding entries in nr_swapper_spaces and + * swapper_spaces will be reused only after at least + * one grace period. So it is impossible for them + * belongs to different usage. 
+ */ + nr = nr_swapper_spaces[i]; + spaces = rcu_dereference(swapper_spaces[i]); + if (!nr || !spaces) + continue; + for (j = 0; j < nr; j++) + ret += spaces[j].nrpages; + } + rcu_read_unlock(); return ret; } @@ -505,3 +514,38 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } + +int init_swap_address_space(unsigned int type, unsigned long nr_pages) +{ + struct address_space *spaces, *space; + unsigned int i, nr; + + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + spaces = vzalloc(sizeof(struct address_space) * nr); + if (!spaces) + return -ENOMEM; + for (i = 0; i < nr; i++) { + space = spaces + i; + INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + atomic_set(&space->i_mmap_writable, 0); + space->a_ops = &swap_aops; + /* swap cache doesn't use writeback related tags */ + mapping_set_no_writeback_tags(space); + spin_lock_init(&space->tree_lock); + } + nr_swapper_spaces[type] = nr; + rcu_assign_pointer(swapper_spaces[type], spaces); + + return 0; +} + +void exit_swap_address_space(unsigned int type) +{ + struct address_space *spaces; + + spaces = swapper_spaces[type]; + nr_swapper_spaces[type] = 0; + rcu_assign_pointer(swapper_spaces[type], NULL); + synchronize_rcu(); + kvfree(spaces); +} diff --git a/mm/swapfile.c b/mm/swapfile.c index 361bc4cd0cd8..052713e4911a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2089,6 +2089,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); + exit_swap_address_space(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2412,8 +2413,12 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return maxpages; } -#define SWAP_CLUSTER_COLS \ +#define SWAP_CLUSTER_INFO_COLS \ DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) +#define SWAP_CLUSTER_SPACE_COLS \ + DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) +#define SWAP_CLUSTER_COLS \ + max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, @@ -2476,7 +2481,10 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; - /* Reduce false cache line sharing between cluster_info */ + /* + * Reduce false cache line sharing between cluster_info and + * sharing same address space. + */ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { j = (k + col) % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { @@ -2666,6 +2674,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + error = init_swap_address_space(p->type, maxpages); + if (error) + goto bad_swap; + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) -- cgit v1.2.3 From 9a72a464445524c858476f472c004e12bda29b33 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 8 Feb 2017 10:26:37 +1100 Subject: mm/swap: skip readahead for unreferenced swap slots We can avoid needlessly allocating page for swap slots that are not used by anyone. No pages have to be read in for these slots. 
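Schematically, the readahead path can now bail out before touching the page allocator at all. The snippet below is a simplified sketch only, not a hunk from this patch: __swp_swapcount() and read_swap_cache_async() are the real helpers involved, while skip_unused_readahead() is a hypothetical wrapper used purely for illustration.

	/*
	 * Sketch: for a swap slot with no references there is nothing to
	 * read back, so skip the page allocation and the I/O entirely.
	 */
	static struct page *skip_unused_readahead(swp_entry_t entry, gfp_t gfp_mask,
						  struct vm_area_struct *vma,
						  unsigned long addr)
	{
		if (!__swp_swapcount(entry))	/* unreferenced slot */
			return NULL;

		return read_swap_cache_async(entry, gfp_mask, vma, addr);
	}
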
Link: http://lkml.kernel.org/r/0784b3f20b9bd3aa5552219624cb78dc4ae710c9.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ mm/swap_state.c | 4 ++++ mm/swapfile.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 51 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 648a32b58e3e..3116382067cd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -398,6 +398,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern bool reuse_swap_page(struct page *, int *); @@ -492,6 +493,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swp_swapcount(swp_entry_t entry) +{ + return 0; +} + static inline int swp_swapcount(swp_entry_t entry) { return 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index 3863acd6189c..3d76d80c07d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -323,6 +323,10 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (found_page) break; + /* Just skip read ahead for unused swap slot */ + if (!__swp_swapcount(entry)) + return NULL; + /* * Get a new page to read into from swap. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 052713e4911a..2dd10f7544dd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -803,7 +803,7 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; unsigned long offset, type; @@ -819,13 +819,8 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) offset = swp_offset(entry); if (offset >= p->max) goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; return p; -bad_free: - pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); - goto out; bad_offset: pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); goto out; @@ -838,6 +833,24 @@ out: return NULL; } +static struct swap_info_struct *_swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = __swap_info_get(entry); + if (!p) + goto out; + if (!p->swap_map[swp_offset(entry)]) + goto bad_free; + return p; + +bad_free: + pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + goto out; +out: + return NULL; +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -991,6 +1004,28 @@ int page_swapcount(struct page *page) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. 
+ */ +int __swp_swapcount(swp_entry_t entry) +{ + int count = 0; + pgoff_t offset; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + + si = __swap_info_get(entry); + if (si) { + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + } + return count; +} + /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. -- cgit v1.2.3 From 28827256660357daf6e20995378f6a0e459103c6 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 8 Feb 2017 10:26:37 +1100 Subject: mm/swap: allocate swap slots in batches Currently, the swap slots are allocated one page at a time, causing contention to the swap_info lock protecting the swap partition on every page being swapped. This patch adds new functions get_swap_pages and scan_swap_map_slots to request multiple swap slots at once. This will reduces the lock contention on the swap_info lock. Also scan_swap_map_slots can operate more efficiently as swap slots often occurs in clusters close to each other on a swap device and it is quicker to allocate them together. Link: http://lkml.kernel.org/r/9fec2845544371f62c3763d43510045e33d286a6.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 + mm/swapfile.c | 136 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 113 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3116382067cd..956eae8a8edf 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -27,6 +27,7 @@ struct bio; #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ SWAP_FLAG_DISCARD_PAGES) +#define SWAP_BATCH 64 static inline int current_is_kswapd(void) { @@ -386,6 +387,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); extern swp_entry_t get_swap_page_of_type(int); +extern int get_swap_pages(int n, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); diff --git a/mm/swapfile.c b/mm/swapfile.c index 2dd10f7544dd..cf4a20021cf0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -501,7 +501,7 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, * Try to get a swap entry from current cpu's swap entry pool (a cluster). This * might involve allocating a new cluster for current CPU too. 
*/ -static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, +static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, unsigned long *offset, unsigned long *scan_base) { struct percpu_cluster *cluster; @@ -525,7 +525,7 @@ new_cluster: *scan_base = *offset = si->cluster_next; goto new_cluster; } else - return; + return false; } found_free = false; @@ -557,16 +557,22 @@ new_cluster: cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; + return found_free; } -static unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static int scan_swap_map_slots(struct swap_info_struct *si, + unsigned char usage, int nr, + swp_entry_t slots[]) { struct swap_cluster_info *ci; unsigned long offset; unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; + int n_ret = 0; + + if (nr > SWAP_BATCH) + nr = SWAP_BATCH; /* * We try to cluster swap pages by allocating them sequentially @@ -584,8 +590,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); - goto checks; + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto scan; } if (unlikely(!si->cluster_nr--)) { @@ -629,8 +637,14 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, checks: if (si->cluster_info) { - while (scan_swap_map_ssd_cluster_conflict(si, offset)) - scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + /* take a break if we already got some slots */ + if (n_ret) + goto done; + if (!scan_swap_map_try_ssd_cluster(si, &offset, + &scan_base)) + goto scan; + } } if (!(si->flags & SWP_WRITEOK)) goto no_page; @@ -655,7 +669,10 @@ checks: if (si->swap_map[offset]) { unlock_cluster(ci); - goto scan; + if (!n_ret) + goto scan; + else + goto done; } if (offset == si->lowest_bit) @@ -674,9 +691,43 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); si->cluster_next = offset + 1; - si->flags -= SWP_SCANNING; + slots[n_ret++] = swp_entry(si->type, offset); + + /* got enough slots or reach max slots? */ + if ((n_ret == nr) || (offset >= si->highest_bit)) + goto done; + + /* search for next available slot */ + + /* time to take a break? */ + if (unlikely(--latency_ration < 0)) { + if (n_ret) + goto done; + spin_unlock(&si->lock); + cond_resched(); + spin_lock(&si->lock); + latency_ration = LATENCY_LIMIT; + } - return offset; + /* try to get more slots in cluster */ + if (si->cluster_info) { + if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) + goto checks; + else + goto done; + } + /* non-ssd case */ + ++offset; + + /* non-ssd case, still more slots in cluster? 
*/ + if (si->cluster_nr && !si->swap_map[offset]) { + --si->cluster_nr; + goto checks; + } + +done: + si->flags -= SWP_SCANNING; + return n_ret; scan: spin_unlock(&si->lock); @@ -714,17 +765,41 @@ scan: no_page: si->flags -= SWP_SCANNING; - return 0; + return n_ret; } -swp_entry_t get_swap_page(void) +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) +{ + swp_entry_t entry; + int n_ret; + + n_ret = scan_swap_map_slots(si, usage, 1, &entry); + + if (n_ret) + return swp_offset(entry); + else + return 0; + +} + +int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) { struct swap_info_struct *si, *next; - pgoff_t offset; + long avail_pgs; + int n_ret = 0; - if (atomic_long_read(&nr_swap_pages) <= 0) + avail_pgs = atomic_long_read(&nr_swap_pages); + if (avail_pgs <= 0) goto noswap; - atomic_long_dec(&nr_swap_pages); + + if (n_goal > SWAP_BATCH) + n_goal = SWAP_BATCH; + + if (n_goal > avail_pgs) + n_goal = avail_pgs; + + atomic_long_sub(n_goal, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -750,14 +825,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - - /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_HAS_CACHE); + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (offset) - return swp_entry(si->type, offset); + if (n_ret) + goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); + si->type); + spin_lock(&swap_avail_lock); nextsi: /* @@ -768,7 +843,8 @@ nextsi: * up between us dropping swap_avail_lock and taking si->lock. * Since we dropped the swap_avail_lock, the swap_avail_head * list may have been modified; so if next is still in the - * swap_avail_head list then try it, otherwise start over. + * swap_avail_head list then try it, otherwise start over + * if we have not gotten any slots. */ if (plist_node_empty(&next->avail_list)) goto start_over; @@ -776,9 +852,19 @@ nextsi: spin_unlock(&swap_avail_lock); - atomic_long_inc(&nr_swap_pages); +check_out: + if (n_ret < n_goal) + atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); noswap: - return (swp_entry_t) {0}; + return n_ret; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry; + + get_swap_pages(1, &entry); + return entry; } /* The only caller of this function is now suspend routine */ -- cgit v1.2.3 From b1542ea18635162fc982e524eaa9546cf3965fb6 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 8 Feb 2017 10:26:38 +1100 Subject: mm/swap: free swap slots in batch Add new functions that free unused swap slots in batches without the need to reacquire swap info lock. This improves scalability and reduce lock contention. Link: http://lkml.kernel.org/r/c25e0fcdfd237ec4ca7db91631d3b9f6ed23824e.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + mm/swapfile.c | 155 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 956eae8a8edf..bcc0b18f96d2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -394,6 +394,7 @@ extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); extern void swapcache_free(swp_entry_t); +extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); diff --git a/mm/swapfile.c b/mm/swapfile.c index cf4a20021cf0..58064d742723 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -947,35 +947,34 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; } -static unsigned char swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage, - bool swap_info_locked) +static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, + struct swap_info_struct *q) +{ + struct swap_info_struct *p; + + p = _swap_info_get(entry); + + if (p != q) { + if (q != NULL) + spin_unlock(&q->lock); + if (p != NULL) + spin_lock(&p->lock); + } + return p; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - bool lock_swap_info = false; - - if (!swap_info_locked) { - count = p->swap_map[offset]; - if (!p->cluster_info || count == usage || count == SWAP_MAP_SHMEM) { -lock_swap_info: - swap_info_locked = true; - lock_swap_info = true; - spin_lock(&p->lock); - } - } - ci = lock_cluster(p, offset); + ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; - if (!swap_info_locked && (count == usage || count == SWAP_MAP_SHMEM)) { - unlock_cluster(ci); - goto lock_swap_info; - } - has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; @@ -999,46 +998,52 @@ lock_swap_info: } usage = count | has_cache; - p->swap_map[offset] = usage; + p->swap_map[offset] = usage ? 
: SWAP_HAS_CACHE; + + unlock_cluster_or_swap_info(p, ci); + + return usage; +} +static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + unsigned char count; + + ci = lock_cluster(p, offset); + count = p->swap_map[offset]; + VM_BUG_ON(count != SWAP_HAS_CACHE); + p->swap_map[offset] = 0; + dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - /* free if no reference */ - if (!usage) { - VM_BUG_ON(!swap_info_locked); - mem_cgroup_uncharge_swap(entry); - ci = lock_cluster(p, offset); - dec_cluster_info_page(p, p->cluster_info, offset); - unlock_cluster(ci); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); + mem_cgroup_uncharge_swap(entry); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; + + p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); } } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; - if (lock_swap_info) - spin_unlock(&p->lock); - - return usage; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } /* @@ -1050,8 +1055,10 @@ void swap_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); - if (p) - swap_entry_free(p, entry, 1, false); + if (p) { + if (!__swap_entry_free(p, entry, 1)) + swapcache_free_entries(&entry, 1); + } } /* @@ -1062,8 +1069,32 @@ void swapcache_free(swp_entry_t entry) struct swap_info_struct *p; p = _swap_info_get(entry); + if (p) { + if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) + swapcache_free_entries(&entry, 1); + } +} + +void swapcache_free_entries(swp_entry_t *entries, int n) +{ + struct swap_info_struct *p, *prev; + int i; + + if (n <= 0) + return; + + prev = NULL; + p = NULL; + for (i = 0; i < n; ++i) { + p = swap_info_get_cont(entries[i], prev); + if (p) + swap_entry_free(p, entries[i]); + else + break; + prev = p; + } if (p) - swap_entry_free(p, entry, SWAP_HAS_CACHE, false); + spin_unlock(&p->lock); } /* @@ -1246,21 +1277,23 @@ int free_swap_and_cache(swp_entry_t entry) { struct swap_info_struct *p; struct page *page = NULL; + unsigned char count; if (non_swap_entry(entry)) return 1; - p = swap_info_get(entry); + p = _swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, 1, true) == SWAP_HAS_CACHE) { + count = __swap_entry_free(p, entry, 1); + if (count == SWAP_HAS_CACHE) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { put_page(page); page = NULL; } - } - 
spin_unlock(&p->lock); + } else if (!count) + swapcache_free_entries(&entry, 1); } if (page) { /* -- cgit v1.2.3 From e8a00a82766735d31a710cef0de9a3606923a18d Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 8 Feb 2017 10:26:38 +1100 Subject: mm/swap: add cache for swap slots allocation We add per cpu caches for swap slots that can be allocated and freed quickly without the need to touch the swap info lock. Two separate caches are maintained for swap slots allocated and swap slots returned. This is to allow the swap slots to be returned to the global pool in a batch so they will have a chance to be coaelesced with other slots in a cluster. We do not reuse the slots that are returned right away, as it may increase fragmentation of the slots. The swap allocation cache is protected by a mutex as we may sleep when searching for empty slots in cache. The swap free cache is protected by a spin lock as we cannot sleep in the free path. We refill the swap slots cache when we run out of slots, and we disable the swap slots cache and drain the slots if the global number of slots fall below a low watermark threshold. We re-enable the cache agian when the slots available are above a high watermark. Link: http://lkml.kernel.org/r/35de301a4eaa8daa2977de6e987f2c154385eb66.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Signed-off-by: "Huang, Ying" Reviewed-by: Michal Hocko Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. Shutemov Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 + include/linux/swap_slots.h | 28 ++++ mm/Makefile | 2 +- mm/swap_slots.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/swapfile.c | 26 ++-- 6 files changed, 382 insertions(+), 12 deletions(-) create mode 100644 include/linux/swap_slots.h create mode 100644 mm/swap_slots.c (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index bcc0b18f96d2..45e91dd6716d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -372,6 +372,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern bool has_usable_swap(void); /* Swap 50% full? Release swapcache more aggressively.. 
*/ static inline bool vm_swap_full(void) @@ -410,6 +411,9 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern int get_swap_slots(int n, swp_entry_t *slots); +extern void swapcache_free_batch(swp_entry_t *entries, int n); + #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h new file mode 100644 index 000000000000..a59e6e2f2c47 --- /dev/null +++ b/include/linux/swap_slots.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_SWAP_SLOTS_H +#define _LINUX_SWAP_SLOTS_H + +#include +#include +#include + +#define SWAP_SLOTS_CACHE_SIZE SWAP_BATCH +#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE (5*SWAP_SLOTS_CACHE_SIZE) +#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE (2*SWAP_SLOTS_CACHE_SIZE) + +struct swap_slots_cache { + bool lock_initialized; + struct mutex alloc_lock; + swp_entry_t *slots; + int nr; + int cur; + spinlock_t free_lock; + swp_entry_t *slots_ret; + int n_ret; +}; + +void disable_swap_slots_cache_lock(void); +void reenable_swap_slots_cache_unlock(void); +int enable_swap_slots_cache(void); +int free_swap_slot(swp_entry_t entry); + +#endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/mm/Makefile b/mm/Makefile index 295bd7a9f76b..433eaf9a876e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,7 +35,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) diff --git a/mm/swap_slots.c b/mm/swap_slots.c new file mode 100644 index 000000000000..b8168392d039 --- /dev/null +++ b/mm/swap_slots.c @@ -0,0 +1,333 @@ +/* + * Manage cache of swap slots to be used for and returned from + * swap. + * + * Copyright(c) 2016 Intel Corporation. + * + * Author: Tim Chen + * + * We allocate the swap slots from the global pool and put + * it into local per cpu caches. This has the advantage + * of no needing to acquire the swap_info lock every time + * we need a new slot. + * + * There is also opportunity to simply return the slot + * to local caches without needing to acquire swap_info + * lock. We do not reuse the returned slots directly but + * move them back to the global pool in a batch. This + * allows the slots to coaellesce and reduce fragmentation. + * + * The swap entry allocated is marked with SWAP_HAS_CACHE + * flag in map_count that prevents it from being allocated + * again from the global pool. + * + * The swap slots cache is protected by a mutex instead of + * a spin lock as when we search for slots with scan_swap_map, + * we can possibly sleep. 
+ */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SWAP + +static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +static bool swap_slot_cache_active; +static bool swap_slot_cache_enabled; +static bool swap_slot_cache_initialized; +DEFINE_MUTEX(swap_slots_cache_mutex); +/* Serialize swap slots cache enable/disable operations */ +DEFINE_MUTEX(swap_slots_cache_enable_mutex); + +static void __drain_swap_slots_cache(unsigned int type); +static void deactivate_swap_slots_cache(void); +static void reactivate_swap_slots_cache(void); + +#define use_swap_slot_cache (swap_slot_cache_active && \ + swap_slot_cache_enabled && swap_slot_cache_initialized) +#define SLOTS_CACHE 0x1 +#define SLOTS_CACHE_RET 0x2 + +static void deactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = false; + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + mutex_unlock(&swap_slots_cache_mutex); +} + +static void reactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = true; + mutex_unlock(&swap_slots_cache_mutex); +} + +/* Must not be called with cpu hot plug lock */ +void disable_swap_slots_cache_lock(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + swap_slot_cache_enabled = false; + if (swap_slot_cache_initialized) { + /* serialize with cpu hotplug operations */ + get_online_cpus(); + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + put_online_cpus(); + } +} + +static void __reenable_swap_slots_cache(void) +{ + swap_slot_cache_enabled = has_usable_swap(); +} + +void reenable_swap_slots_cache_unlock(void) +{ + __reenable_swap_slots_cache(); + mutex_unlock(&swap_slots_cache_enable_mutex); +} + +static bool check_cache_active(void) +{ + long pages; + + if (!swap_slot_cache_enabled || !swap_slot_cache_initialized) + return false; + + pages = get_nr_swap_pages(); + if (!swap_slot_cache_active) { + if (pages > num_online_cpus() * + THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) + reactivate_swap_slots_cache(); + goto out; + } + + /* if global pool of slot caches too low, deactivate cache */ + if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) + deactivate_swap_slots_cache(); +out: + return swap_slot_cache_active; +} + +static int alloc_swap_slot_cache(unsigned int cpu) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots, *slots_ret; + + /* + * Do allocation outside swap_slots_cache_mutex + * as vzalloc could trigger reclaim and get_swap_page, + * which can lock swap_slots_cache_mutex. 
+ */ + slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots) + return -ENOMEM; + + slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + if (!slots_ret) { + vfree(slots); + return -ENOMEM; + } + + mutex_lock(&swap_slots_cache_mutex); + cache = &per_cpu(swp_slots, cpu); + if (cache->slots || cache->slots_ret) + /* cache already allocated */ + goto out; + if (!cache->lock_initialized) { + mutex_init(&cache->alloc_lock); + spin_lock_init(&cache->free_lock); + cache->lock_initialized = true; + } + cache->nr = 0; + cache->cur = 0; + cache->n_ret = 0; + cache->slots = slots; + slots = NULL; + cache->slots_ret = slots_ret; + slots_ret = NULL; +out: + mutex_unlock(&swap_slots_cache_mutex); + if (slots) + vfree(slots); + if (slots_ret) + vfree(slots_ret); + return 0; +} + +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots = NULL; + + cache = &per_cpu(swp_slots, cpu); + if ((type & SLOTS_CACHE) && cache->slots) { + mutex_lock(&cache->alloc_lock); + swapcache_free_entries(cache->slots + cache->cur, cache->nr); + cache->cur = 0; + cache->nr = 0; + if (free_slots && cache->slots) { + vfree(cache->slots); + cache->slots = NULL; + } + mutex_unlock(&cache->alloc_lock); + } + if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + if (free_slots && cache->slots_ret) { + slots = cache->slots_ret; + cache->slots_ret = NULL; + } + spin_unlock_irq(&cache->free_lock); + if (slots) + vfree(slots); + } +} + +static void __drain_swap_slots_cache(unsigned int type) +{ + unsigned int cpu; + + /* + * This function is called during + * 1) swapoff, when we have to make sure no + * left over slots are in cache when we remove + * a swap device; + * 2) disabling of swap slot cache, when we run low + * on swap slots when allocating memory and need + * to return swap slots to global pool. + * + * We cannot acquire cpu hot plug lock here as + * this function can be invoked in the cpu + * hot plug path: + * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback + * -> memory allocation -> direct reclaim -> get_swap_page + * -> drain_swap_slots_cache + * + * Hence the loop over current online cpu below could miss cpu that + * is being brought online but not yet marked as online. + * That is okay as we do not schedule and run anything on a + * cpu before it has been marked online. Hence, we will not + * fill any swap slots in slots cache of such cpu. + * There are no slots on such cpu that need to be drained. 
+ */ + for_each_online_cpu(cpu) + drain_slots_cache_cpu(cpu, type, false); +} + +static int free_slot_cache(unsigned int cpu) +{ + mutex_lock(&swap_slots_cache_mutex); + drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} + +int enable_swap_slots_cache(void) +{ + int ret = 0; + + mutex_lock(&swap_slots_cache_enable_mutex); + if (swap_slot_cache_initialized) { + __reenable_swap_slots_cache(); + goto out_unlock; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", + alloc_swap_slot_cache, free_slot_cache); + if (ret < 0) + goto out_unlock; + swap_slot_cache_initialized = true; + __reenable_swap_slots_cache(); +out_unlock: + mutex_unlock(&swap_slots_cache_enable_mutex); + return 0; +} + +/* called with swap slot cache's alloc lock held */ +static int refill_swap_slots_cache(struct swap_slots_cache *cache) +{ + if (!use_swap_slot_cache || cache->nr) + return 0; + + cache->cur = 0; + if (swap_slot_cache_active) + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + + return cache->nr; +} + +int free_swap_slot(swp_entry_t entry) +{ + struct swap_slots_cache *cache; + + BUG_ON(!swap_slot_cache_initialized); + + cache = &get_cpu_var(swp_slots); + if (use_swap_slot_cache && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + /* Swap slots cache may be deactivated before acquiring lock */ + if (!use_swap_slot_cache) { + spin_unlock_irq(&cache->free_lock); + goto direct_free; + } + if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { + /* + * Return slots to global pool. + * The current swap_map value is SWAP_HAS_CACHE. + * Set it to 0 to indicate it is available for + * allocation in global pool + */ + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + } + cache->slots_ret[cache->n_ret++] = entry; + spin_unlock_irq(&cache->free_lock); + } else { +direct_free: + swapcache_free_entries(&entry, 1); + } + put_cpu_var(swp_slots); + + return 0; +} + +swp_entry_t get_swap_page(void) +{ + swp_entry_t entry, *pentry; + struct swap_slots_cache *cache; + + cache = this_cpu_ptr(&swp_slots); + + entry.val = 0; + if (check_cache_active()) { + mutex_lock(&cache->alloc_lock); + if (cache->slots) { +repeat: + if (cache->nr) { + pentry = &cache->slots[cache->cur++]; + entry = *pentry; + pentry->val = 0; + cache->nr--; + } else { + if (refill_swap_slots_cache(cache)) + goto repeat; + } + } + mutex_unlock(&cache->alloc_lock); + if (entry.val) + return entry; + } + + get_swap_pages(1, &entry); + + return entry; +} + +#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 3d76d80c07d6..e1f07cafecaa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -18,6 +18,7 @@ #include #include #include +#include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 58064d742723..47215be71a59 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -859,14 +860,6 @@ noswap: return n_ret; } -swp_entry_t get_swap_page(void) -{ - swp_entry_t entry; - - get_swap_pages(1, &entry); - return entry; -} - /* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { @@ -1057,7 +1050,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) { if (!__swap_entry_free(p, entry, 1)) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } } @@ -1071,7 +1064,7 @@ void swapcache_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) { if (!__swap_entry_free(p, 
entry, SWAP_HAS_CACHE)) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } } @@ -1293,7 +1286,7 @@ int free_swap_and_cache(swp_entry_t entry) page = NULL; } } else if (!count) - swapcache_free_entries(&entry, 1); + free_swap_slot(entry); } if (page) { /* @@ -2121,6 +2114,17 @@ static void reinsert_swap_info(struct swap_info_struct *p) spin_unlock(&swap_lock); } +bool has_usable_swap(void) +{ + bool ret = true; + + spin_lock(&swap_lock); + if (plist_head_empty(&swap_active_head)) + ret = false; + spin_unlock(&swap_lock); + return ret; +} + SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; -- cgit v1.2.3 From c5a04c2756aa838f126819cf017f8da283da8689 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 8 Feb 2017 10:26:39 +1100 Subject: mm/swap: add comments on locks in swap_slots.h Explains what each lock protects in swap_slots_cache structure. Link: http://lkml.kernel.org/r/20170118180327.GA24225@linux.intel.com Signed-off-by: Tim Chen Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index a59e6e2f2c47..ba5623b27c60 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -11,11 +11,11 @@ struct swap_slots_cache { bool lock_initialized; - struct mutex alloc_lock; + struct mutex alloc_lock; /* protects slots, nr, cur */ swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; + spinlock_t free_lock; /* protects slots_ret, n_ret */ swp_entry_t *slots_ret; int n_ret; }; -- cgit v1.2.3 From 236b801e685c1986fabdd79175202652a51f1427 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 8 Feb 2017 10:26:40 +1100 Subject: mm/swap: skip readahead only when swap slot cache is enabled Because during swap off, a swap entry may have swap_map[] == SWAP_HAS_CACHE (for example, just allocated). If we return NULL in __read_swap_cache_async(), the swap off will abort. So when swap slot cache is disabled, (for swap off), we will wait for page to be put into swap cache in such race condition. This should not be a problem for swap slot cache, because swap slot cache should be drained after clearing swap_slot_cache_enabled. Link: http://lkml.kernel.org/r/5e2c5f6abe8e6eb0797408897b1bba80938e9b9d.1484082593.git.tim.c.chen@linux.intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Tim Chen Cc: Aaron Lu Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Christian Borntraeger Cc: Dave Hansen Cc: Hillf Danton Cc: Huang Ying Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet escreveu: Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 2 ++ mm/swap_slots.c | 2 +- mm/swap_state.c | 11 +++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index ba5623b27c60..6ef92d17633d 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -25,4 +25,6 @@ void reenable_swap_slots_cache_unlock(void); int enable_swap_slots_cache(void); int free_swap_slot(swp_entry_t entry); +extern bool swap_slot_cache_enabled; + #endif /* _LINUX_SWAP_SLOTS_H */ diff --git a/mm/swap_slots.c b/mm/swap_slots.c index ebf4f1cbac04..9b5bc86f96ad 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -36,7 +36,7 @@ static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; -static bool swap_slot_cache_enabled; +bool swap_slot_cache_enabled; static bool swap_slot_cache_initialized; DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ diff --git a/mm/swap_state.c b/mm/swap_state.c index e1f07cafecaa..2126e9ba23b2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -324,8 +324,15 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (found_page) break; - /* Just skip read ahead for unused swap slot */ - if (!__swp_swapcount(entry)) + /* + * Just skip read ahead for unused swap slot. + * During swap_off when swap_slot_cache is disabled, + * we have to handle the race between putting + * swap entry in swap cache and marking swap slot + * as SWAP_HAS_CACHE. That's done in later part of code or + * else swap_off will be aborted if we return NULL. + */ + if (!__swp_swapcount(entry) && swap_slot_cache_enabled) return NULL; /* -- cgit v1.2.3 From 2b53b238a697f65c3a311baf0c3bd7503be44b73 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 8 Feb 2017 10:26:40 +1100 Subject: mm, thp: add new defer+madvise defrag option There is no thp defrag option that currently allows MADV_HUGEPAGE regions to do direct compaction and reclaim while all other thp allocations simply trigger kswapd and kcompactd in the background and fail immediately. The "defer" setting simply triggers background reclaim and compaction for all regions, regardless of MADV_HUGEPAGE, which makes it unusable for our userspace where MADV_HUGEPAGE is being used to indicate the application is willing to wait for work for thp memory to be available. The "madvise" setting will do direct compaction and reclaim for these MADV_HUGEPAGE regions, but does not trigger kswapd and kcompactd in the background for anybody else. For reasonable usage, there needs to be a mesh between the two options. This patch introduces a fifth mode, "defer+madvise", that will do direct reclaim and compaction for MADV_HUGEPAGE regions and trigger background reclaim and compaction for everybody else so that hugepages may be available in the near future. A proposal to allow direct reclaim and compaction for MADV_HUGEPAGE regions as part of the "defer" mode, making it a very powerful setting and avoids breaking userspace, was offered: http://marc.info/?t=148236612700003. This additional mode is a compromise. A second proposal to allow both "defer" and "madvise" to be selected at the same time was also offered: http://marc.info/?t=148357345300001. 
This is possible, but there was a concern that it might break existing userspaces that parse the output of the defrag mode, so the fifth option was introduced instead. This patch also cleans up the helper function for storing to "enabled" and "defrag" since the former supports three modes while the latter supports five and triple_flag_store() was getting unnecessarily messy. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701101614330.41805@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- Documentation/vm/transhuge.txt | 8 ++- include/linux/huge_mm.h | 1 + mm/huge_memory.c | 146 +++++++++++++++++++++-------------------- 3 files changed, 82 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index c4171e4519c2..8fda5b7b24e9 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -110,6 +110,7 @@ MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag echo defer >/sys/kernel/mm/transparent_hugepage/defrag +echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag @@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start to utilise them. "defer" means that an application will wake kswapd in the background -to reclaim pages and wake kcompact to compact memory so that THP is +to reclaim pages and wake kcompactd to compact memory so that THP is available in the near future. It's the responsibility of khugepaged to then install the THP pages later. +"defer+madvise" will enter direct reclaim and compaction like "always", but +only for regions that have used madvise(MADV_HUGEPAGE); all other regions +will wake kswapd in the background to reclaim pages and wake kcompactd to +compact memory so that THP is available in the near future. + "madvise" will enter direct reclaim like "always" but only for regions that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 97e478d6b690..f0029e786205 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -33,6 +33,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f3ad65c85de..f9ecc2aeadfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = { }; #ifdef CONFIG_SYSFS - -static ssize_t triple_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag deferred, - enum transparent_hugepage_flag req_madv) -{ - if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - if (enabled == deferred) - return -EINVAL; - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(deferred, &transparent_hugepage_flags); - } else if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { - clear_bit(deferred, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(enabled, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - set_bit(req_madv, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - } else - return -EINVAL; - - return count; -} - static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret; + ssize_t ret = count; - ret = triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } - return ret; } static struct kobj_attribute enabled_attr = @@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } -/* - * Currently defrag only disables __GFP_NOWAIT for allocation. A blind - * __GFP_REPEAT is too aggressive, it's never worth swapping tons of - * memory just to allocate one more hugepage. 
- */ static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer madvise never\n"); + return sprintf(buf, "[always] defer defer+madvise madvise never\n"); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [madvise] never\n"); - else - return sprintf(buf, "always defer madvise [never]\n"); - + return sprintf(buf, "always [defer] defer+madvise madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [defer+madvise] madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer defer+madvise [madvise] never\n"); + return sprintf(buf, "always defer defer+madvise madvise [never]\n"); } + static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer+madvise", buf, + min(sizeof("defer+madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; } 
static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); @@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } /* - * If THP defrag is set to always then directly reclaim/compact as necessary - * If set to defer then do only background reclaim/compact and defer to khugepaged - * If set to madvise and the VMA is flagged then directly reclaim/compact - * When direct reclaim/compact is allowed, don't retry except for flagged VMA's + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags) && vma_madvised) - return GFP_TRANSHUGE; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + 0); return GFP_TRANSHUGE_LIGHT; } -- cgit v1.2.3 From 951ea784aa226bae80d0d7db0e59fc9d7766f798 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:41 +1100 Subject: mm, vmscan: cleanup lru size calculations lruvec_lru_size returns the full size of the LRU list while we sometimes need a value reduced only to eligible zones (e.g. for lowmem requests). inactive_list_is_low is one such user. Later patches will add more of them. Add a new parameter to lruvec_lru_size and allow it to filter out zones which are not eligible for the given context.
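As an illustration only (not part of the patch; the hunks below are authoritative), a caller interested in lowmem-eligible pages passes the highest eligible zone index, while MAX_NR_ZONES keeps the old whole-list behaviour:

	/* sketch of the new calling convention */
	unsigned long eligible = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx);
	unsigned long total    = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);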
Link: http://lkml.kernel.org/r/20170117103702.28542-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Minchan Kim Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/vmscan.c | 89 +++++++++++++++++++++++++------------------------- mm/workingset.c | 2 +- 3 files changed, 46 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f4aac87adcc3..82fc632fd11d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORY_PRESENT void memory_present(int nid, unsigned long start, unsigned long end); diff --git a/mm/vmscan.c b/mm/vmscan.c index 277e105646a5..e7c75a3c6b53 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -234,22 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) pgdat_reclaimable_pages(pgdat) * 6; } -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. + * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) + */ +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { + unsigned long lru_size; + int zid; + if (!mem_cgroup_disabled()) - return mem_cgroup_get_lru_size(lruvec, lru); + lru_size = mem_cgroup_get_lru_size(lruvec, lru); + else + lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); -} + for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + unsigned long size; -unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, - int zone_idx) -{ - if (!mem_cgroup_disabled()) - return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + if (!managed_zone(zone)) + continue; + + if (!mem_cgroup_disabled()) + size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + else + size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], + NR_ZONE_LRU_BASE + lru); + lru_size -= min(size, lru_size); + } + + return lru_size; - return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], - NR_ZONE_LRU_BASE + lru); } /* @@ -2049,11 +2066,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, struct scan_control *sc, bool trace) { unsigned long inactive_ratio; - unsigned long total_inactive, inactive; - unsigned long total_active, active; + unsigned long inactive, active; + enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; unsigned long gb; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - int zid; /* * If we don't have swap space, anonymous page deactivation @@ -2062,27 +2078,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!file && !total_swap_pages) return false; - total_inactive = inactive = lruvec_lru_size(lruvec, file * LRU_FILE); - total_active = active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); - - /* - * For zone-constrained allocations, it is necessary to check if - * deactivations are required for lowmem to be reclaimed. 
This - * calculates the inactive/active pages available in eligible zones. - */ - for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - unsigned long inactive_zone, active_zone; - - if (!managed_zone(zone)) - continue; - - inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); - active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); - - inactive -= min(inactive, inactive_zone); - active -= min(active, active_zone); - } + inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); + active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -2091,10 +2088,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; if (trace) - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, + trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, sc->reclaim_idx, - total_inactive, inactive, - total_active, active, inactive_ratio, file); + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); + return inactive * inactive_ratio < active; } @@ -2234,7 +2233,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. */ if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2260,10 +2259,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); - file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2301,7 +2300,7 @@ out: unsigned long size; unsigned long scan; - size = lruvec_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru, MAX_NR_ZONES); scan = size >> sc->priority; if (!scan && pass && force_scan) diff --git a/mm/workingset.c b/mm/workingset.c index abb58ffa3c64..a67f5796b995 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) } lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); rcu_read_unlock(); /* -- cgit v1.2.3 From 3a2a6efb8437c62f82620c752e615a5c21a6d571 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:44 +1100 Subject: mm, page_alloc: warn_alloc print nodemask warn_alloc is currently used to report an allocation failure or an allocation stall. We print some details of the allocation request like the gfp mask and the request order. We do not print the allocation nodemask, which is also important when debugging the reason for the allocation failure. We already print the nodemask in the OOM report. Add nodemask to warn_alloc and print it in warn_alloc as well.
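For illustration only (the hunks below are authoritative), a call site in the allocator slow path now hands the request's nodemask to warn_alloc so the report shows which nodes were considered; a NULL nodemask falls back to cpuset_current_mems_allowed:

	/* sketch of an updated caller */
	warn_alloc(gfp_mask, ac->nodemask,
		   "page allocation failure: order:%u", order);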
Link: http://lkml.kernel.org/r/20170117091543.25850-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- mm/page_alloc.c | 10 ++++++---- mm/vmalloc.c | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e9012babd322..c021e39ab0e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1938,8 +1938,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid); extern unsigned long arch_reserved_kernel_pages(void); #endif -extern __printf(2, 3) -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...); +extern __printf(3, 4) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf6b53dc08f9..96c8fe602dfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3028,12 +3028,13 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) show_mem(filter); } -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) { struct va_format vaf; va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + nodemask_t *nm = (nodemask) ? nodemask : &cpuset_current_mems_allowed; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) @@ -3047,7 +3048,8 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=%*pbl\n", gfp_mask, &gfp_mask, nodemask_pr_args(nm)); + cpuset_print_current_mems_allowed(); dump_stack(); warn_alloc_show_mem(gfp_mask); @@ -3724,7 +3726,7 @@ retry: /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3775,7 +3777,7 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5f5b09e9dccd..d89034a393f2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1662,7 +1662,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); @@ -1724,7 +1724,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); return NULL; } -- cgit v1.2.3 From 902592448cc556751365de991dcf1a7eed78a98d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:45 +1100 Subject: lib/show_mem.c: teach show_mem to work with the given nodemask show_mem() can filter out node-specific data which is irrelevant to the allocation request via SHOW_MEM_FILTER_NODES. The filtering is done in skip_free_areas_node which skips all nodes which are not in the mems_allowed of the current process.
This works most of the time as expected because the nodemask shouldn't be outside of the allocating task but there are some exceptions. E.g. memory hotplug might want to request allocations from outside of the allowed nodes (see new_node_page). Get rid of this hardcoded behavior and push the allocation mask down the show_mem path and use it instead of cpuset_current_mems_allowed. NULL nodemask is interpreted as cpuset_current_mems_allowed. Link: http://lkml.kernel.org/r/20170117091543.25850-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Cc: Hillf Danton Cc: Johannes Weiner Cc: Vlastimil Babka Cc: David Rientjes Signed-off-by: Andrew Morton --- arch/powerpc/xmon/xmon.c | 2 +- arch/sparc/kernel/setup_32.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- include/linux/mm.h | 5 ++--- lib/show_mem.c | 4 ++-- mm/nommu.c | 6 +++--- mm/oom_kill.c | 2 +- mm/page_alloc.c | 38 ++++++++++++++++++------------------- 10 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 9c0e17cf6886..01ba3e90d314 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -916,7 +916,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0); + show_mem(0, NULL); break; default: termch = cmd; diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index c4e65cb3280f..6f06058c5ae7 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -82,7 +82,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0); + show_free_areas(0, NULL); if (!is_idle_task(current)) { local_irq_enable(); sys_sync(); diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index d390b9663dc3..57e6cef81ebe 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev) skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC); if (!skb) { - show_free_areas(0); + show_free_areas(0, NULL); continue; } diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 701c085bb19b..71136742e606 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(0); + show_mem(0, NULL); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 3dd6a491cdba..397e1509fe51 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(0); + show_mem(0, NULL); } static void fn_show_state(struct vc_data *vc) diff --git a/include/linux/mm.h b/include/linux/mm.h index c021e39ab0e6..a75f63a585d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1148,8 +1148,7 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags); -extern bool skip_free_areas_node(unsigned int flags, int nid); +extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); int shmem_zero_setup(struct vm_area_struct *); #ifdef CONFIG_SHMEM @@ -1930,7 +1929,7 @@ extern void setup_per_zone_wmarks(void); 
extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags); +extern void show_mem(unsigned int flags, nodemask_t *nodemask); extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1feed6a2b12a..0beaa1d899aa 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,13 +9,13 @@ #include #include -void show_mem(unsigned int filter) +void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter); + show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { unsigned long flags; diff --git a/mm/nommu.c b/mm/nommu.c index 24f9f5f39145..bc964c26be8c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1191,7 +1191,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } @@ -1412,13 +1412,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0); + show_free_areas(0, NULL); return -ENOMEM; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ec9f11d4f094..7176b6a754cf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (oc->memcg) mem_cgroup_print_oom_info(oc->memcg, p); else - show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES, nm); if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96c8fe602dfb..f127dd94a78b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3005,7 +3005,7 @@ static inline bool should_suppress_show_mem(void) return ret; } -static void warn_alloc_show_mem(gfp_t gfp_mask) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); @@ -3025,7 +3025,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter); + show_mem(filter, nodemask); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -3052,7 +3052,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) cpuset_print_current_mems_allowed(); dump_stack(); - warn_alloc_show_mem(gfp_mask); + warn_alloc_show_mem(gfp_mask, nm); } static inline struct page * @@ -4274,20 +4274,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 
*/ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with the + * synchronization - read_mems_allowed_begin - because we do not have to be + * precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4328,7 +4328,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4336,7 +4336,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4370,7 +4370,7 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { - if (skip_free_areas_node(filter, pgdat->node_id)) + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; printk("Node %d" @@ -4422,7 +4422,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4487,7 +4487,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); -- cgit v1.2.3 From f348f255e406a24440f2b19cf30488366d8d3e1a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 8 Feb 2017 10:26:53 +1100 Subject: mm: drop zap_details::ignore_dirty The only user of ignore_dirty is the oom-reaper, but it doesn't really use it. ignore_dirty only has an effect on file pages mapped with a dirty pte. But the oom-reaper skips shared VMAs, so there's no way we can dirty a file pte in them. Link: http://lkml.kernel.org/r/20170118122429.43661-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A.
Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/memory.c | 6 ------ mm/oom_kill.c | 3 +-- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a75f63a585d0..f21cc80a9b8f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1171,7 +1171,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - bool ignore_dirty; /* Ignore dirty pages */ bool check_swap_entries; /* Check also swap entries */ }; diff --git a/mm/memory.c b/mm/memory.c index d7676a68c80a..88872a93c3ca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1155,12 +1155,6 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { - /* - * oom_reaper cannot tear down dirty - * pages - */ - if (unlikely(details && details->ignore_dirty)) - continue; force_flush = 1; set_page_dirty(page); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c7b48b4282d9..33fcc8a40aeb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -465,8 +465,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true, - .ignore_dirty = true}; + struct zap_details details = {.check_swap_entries = true}; bool ret = true; /* -- cgit v1.2.3 From 41e6945f560a328e6736f6456e9757a96024a786 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 8 Feb 2017 10:26:54 +1100 Subject: mm: drop zap_details::check_swap_entries detail == NULL would give the same functionality as .check_swap_entries==true. Link: http://lkml.kernel.org/r/20170118122429.43661-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/memory.c | 4 ++-- mm/oom_kill.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index f21cc80a9b8f..3dc700e834c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1171,7 +1171,6 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ - bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 88872a93c3ca..e9035a0afee2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1173,8 +1173,8 @@ again: } continue; } - /* only check swap_entries if explicitly asked for in details */ - if (unlikely(details && !details->check_swap_entries)) + /* If details->check_mapping, we leave swap entries. 
*/ + if (unlikely(details)) continue; entry = pte_to_swp_entry(ptent); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 33fcc8a40aeb..a1977247c7ea 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -465,7 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct zap_details details = {.check_swap_entries = true}; bool ret = true; /* @@ -531,7 +530,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - &details); + NULL); } tlb_finish_mmu(&tlb, 0, -1); pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", -- cgit v1.2.3 From f36951b01850333725cb584471100c690b7bf684 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 8 Feb 2017 10:26:55 +1100 Subject: mm: drop unused argument of zap_page_range() There are no users of zap_page_range() that want a non-NULL 'details'. Let's drop it. Link: http://lkml.kernel.org/r/20170118122429.43661-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Peter Zijlstra Cc: Rik van Riel Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 2 +- arch/x86/mm/mpx.c | 2 +- drivers/android/binder.c | 2 +- drivers/staging/android/ion/ion.c | 3 +-- include/linux/mm.h | 2 +- mm/madvise.c | 2 +- mm/memory.c | 5 ++--- 7 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ec1f0dedb948..59ac93714fa4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size, NULL); + zap_page_range(vma, vmaddr, size); } up_read(&gmap->mm->mmap_sem); } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index af59f808742f..aad4ac386f98 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm, return -EINVAL; len = min(vma->vm_end, end) - addr; - zap_page_range(vma, addr, len, NULL); + zap_page_range(vma, addr, len); trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3c71b982bf2a..d97f6725cf8c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -629,7 +629,7 @@ free_range: page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; if (vma) zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); + proc->user_buffer_offset, PAGE_SIZE); err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index b653451843c8..0fb0e28ace70 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, list_for_each_entry(vma_list, &buffer->vmas, list) { struct vm_area_struct *vma = vma_list->vma; - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, - NULL); + zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); } mutex_unlock(&buffer->lock); } diff --git
a/include/linux/mm.h b/include/linux/mm.h index 3dc700e834c6..f317d308db3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1181,7 +1181,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); + unsigned long size); void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); diff --git a/mm/madvise.c b/mm/madvise.c index ca75b8a01ba0..7f1490f0d3a6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -478,7 +478,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, return -EINVAL; madvise_userfault_dontneed(vma, prev, start, end); - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start); return 0; } diff --git a/mm/memory.c b/mm/memory.c index e9035a0afee2..7663068a33c6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1370,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of shared cache invalidation * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long size, struct zap_details *details) + unsigned long size) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; @@ -1386,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, details); + unmap_single_vma(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } -- cgit v1.2.3 From e5ff4d3d9ab83a3283854fbb7565a6a21764cd3e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 8 Feb 2017 10:26:57 +1100 Subject: mm: fix stray kernel-doc notation Delete stray (second) function description in find_lock_page() kernel-doc notation. Note: scripts/kernel-doc just ignores the second function description. Fixes: 2457aec63745e ("mm: non-atomically mark page accessed during page cache allocation where possible") Link: http://lkml.kernel.org/r/b037e9a3-516c-ec02-6c8e-fa5479747ba6@infradead.org Signed-off-by: Randy Dunlap Reported-by: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b572f5530392..84943e8057ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -266,7 +266,6 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, /** * find_lock_page - locate, pin and lock a pagecache page - * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * -- cgit v1.2.3 From 91574eceaad7a937da6edc5ee31b21b87934838d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:00 +1100 Subject: mm: fix type width of section to/from pfn conversion macros Patch series "mm: sub-section memory hotplug support", v3. 
The initial motivation for this change is persistent memory platforms that, unfortunately, align the pmem range on a boundary less than a full section (64M vs 128M), and may change the alignment from one boot to the next. A secondary motivation is the arrival of prospective ZONE_DEVICE users that want devm_memremap_pages() to map PCI-E device memory ranges to enable peer-to-peer DMA. There is a range of possible physical address alignments of PCI-E BARs that are less than 128M. Currently the libnvdimm core injects padding when 'pfn' (struct page mapping configuration) instances are created. However, not all users of devm_memremap_pages() have the opportunity to inject such padding. Users of the memmap=ss!nn kernel command line option can trigger the following failure with unaligned parameters like "memmap=0xfc000000!8G": WARNING: CPU: 0 PID: 558 at kernel/memremap.c:300 devm_memremap_pages attempted on mixed region [mem 0x200000000-0x2fbffffff flags 0x200] [..] Call Trace: [] dump_stack+0x86/0xc3 [] __warn+0xcb/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] devm_memremap_pages+0x3b5/0x4c0 [] __wrap_devm_memremap_pages+0x58/0x70 [nfit_test_iomap] [] pmem_attach_disk+0x19a/0x440 [nd_pmem] Without this change a user could inadvertently lose access to nvdimm namespaces after a configuration change. The act of adding, removing, or rearranging DIMMs in the platform could lead to the BIOS changing the base alignment of the namespace in an incompatible fashion. With this support we can accommodate a BIOS changing the namespace to any alignment provided it is >= SECTION_ACTIVE_SIZE. In other words, we are protecting against misalignment injected by the BIOS after the libnvdimm sub-system already recorded that the namespace does not need alignment padding. In that case the user would need to figure out how to undo the configuration change to regain access to their nvdimm capacity. This patch (of 12): section_nr_to_pfn() will silently accept an argument that is too small to contain a pfn. Cast the argument to an unsigned long, similar to PFN_PHYS(). Fix up pfn_to_section_nr() in the same way. This was discovered in __add_pages() when converting it to use a signed integer for the loop variable.
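A minimal sketch of the failure mode (illustrative numbers, not from the patch, assuming a configuration where PFN_SECTION_SHIFT works out to 15, e.g. x86_64 with SECTION_SIZE_BITS 27 and PAGE_SHIFT 12):

	int sec = 0x40000;                                 /* section number held in a 32-bit int */
	unsigned long pfn_bad  = sec << 15;                /* shift is evaluated in int and overflows */
	unsigned long pfn_good = (unsigned long)sec << 15; /* what the fixed macro effectively does */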
Link: http://lkml.kernel.org/r/148486360180.19694.6303499052926089764.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- mm/memory_hotplug.c | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 82fc632fd11d..68e8bec6a4be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1066,8 +1066,8 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif -#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) -#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) +#define pfn_to_section_nr(pfn) ((unsigned long)(pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((unsigned long)(sec) << PFN_SECTION_SHIFT) #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d67787d10ff0..7858d0df2da7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -529,9 +529,7 @@ static int __meminit __add_section(int nid, struct zone *zone, int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { - unsigned long i; - int err = 0; - int start_sec, end_sec; + int err = 0, i, start_sec, end_sec; struct vmem_altmap *altmap; clear_zone_contiguous(zone); -- cgit v1.2.3 From 7b96d11c60a1844fbb6dd91dc97d9044a856d518 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:01 +1100 Subject: mm: introduce struct mem_section_usage to track partial population of a section 'struct mem_section_usage' combines the existing 'pageblock_flags' bitmap with a new 'map_active' bitmap. The new bitmap enables the memory hot{plug,remove} implementation to act on incremental sub-divisions of a section. The primary impetus for this functionality is to support platforms that mix "System RAM" and "Persistent Memory" within a single section. We want to be able to hotplug "Persistent Memory" to extend a partially populated section and share that section between ZONE_DEVICE and ZONE_NORMAL/MOVABLE memory. This introduces a pointer to the new 'map_active' bitmap through struct mem_section, but otherwise should not change any behavior. 
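As a rough illustration (assuming x86_64 defaults of SECTION_SIZE_BITS == 27 and 64-bit longs, so one map_active bit covers a 2M slice of a 128M section; the helper below is hypothetical and not part of the patch), a sub-section presence check could look like:

	/* hypothetical helper, for illustration only */
	static inline bool pfn_sub_section_active(struct mem_section *ms, unsigned long pfn)
	{
		unsigned long idx = (pfn & ~PAGE_SECTION_MASK) /
				    (SECTION_ACTIVE_SIZE >> PAGE_SHIFT);
		return test_bit(idx, &ms->usage->map_active);
	}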
Link: http://lkml.kernel.org/r/148486361735.19694.17147399375362860739.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 21 +++++++++-- mm/memory_hotplug.c | 4 +-- mm/page_alloc.c | 2 +- mm/sparse.c | 98 +++++++++++++++++++++++++++----------------------- 4 files changed, 75 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 68e8bec6a4be..5b5a6dc2df05 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1072,6 +1072,19 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) +#define SECTION_ACTIVE_SIZE ((1UL << SECTION_SIZE_BITS) / BITS_PER_LONG) +#define SECTION_ACTIVE_MASK (~(SECTION_ACTIVE_SIZE - 1)) + +struct mem_section_usage { + /* + * SECTION_ACTIVE_SIZE portions of the section that are populated in + * the memmap + */ + unsigned long map_active; + /* See declaration of similar field in struct zone */ + unsigned long pageblock_flags[0]; +}; + struct page; struct page_ext; struct mem_section { @@ -1089,8 +1102,7 @@ struct mem_section { */ unsigned long section_mem_map; - /* See declaration of similar field in struct zone */ - unsigned long *pageblock_flags; + struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use @@ -1121,6 +1133,11 @@ extern struct mem_section *mem_section[NR_SECTION_ROOTS]; extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif +static inline unsigned long *section_to_usemap(struct mem_section *ms) +{ + return ms->usage->pageblock_flags; +} + static inline struct mem_section *__nr_to_section(unsigned long nr) { if (!mem_section[SECTION_NR_TO_ROOT(nr)]) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7858d0df2da7..29f7a7ce7a34 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -228,7 +228,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = section_to_usemap(__nr_to_section(section_nr)); page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -254,7 +254,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = section_to_usemap(__nr_to_section(section_nr)); page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c21b33668133..372f28f0030f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -353,7 +353,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; + return section_to_usemap(__pfn_to_section(pfn)); #else return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..d0d4c005dc60 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -233,15 +233,15 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, 
unsigned long pn static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, - unsigned long *pageblock_bitmap) + struct mem_section_usage *usage) { if (!present_section(ms)) return -EINVAL; ms->section_mem_map &= ~SECTION_MAP_MASK; ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | - SECTION_HAS_MEM_MAP; - ms->pageblock_flags = pageblock_bitmap; + SECTION_HAS_MEM_MAP; + ms->usage = usage; return 1; } @@ -255,9 +255,13 @@ unsigned long usemap_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -static unsigned long *__kmalloc_section_usemap(void) +static struct mem_section_usage *__alloc_section_usage(void) { - return kmalloc(usemap_size(), GFP_KERNEL); + struct mem_section_usage *usage; + + usage = kzalloc(sizeof(*usage) + usemap_size(), GFP_KERNEL); + /* TODO: allocate the map_active bitmap */ + return usage; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -293,7 +297,8 @@ again: return p; } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { unsigned long usemap_snr, pgdat_snr; static unsigned long old_usemap_snr = NR_MEM_SECTIONS; @@ -301,7 +306,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) struct pglist_data *pgdat = NODE_DATA(nid); int usemap_nid; - usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) return; @@ -336,7 +341,8 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -344,26 +350,27 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, - unsigned long usemap_count, int nodeid) + unsigned long usage_count, int nodeid) { - void *usemap; + void *usage; unsigned long pnum; - unsigned long **usemap_map = (unsigned long **)data; - int size = usemap_size(); + struct mem_section_usage **usage_map = data; + int size = sizeof(struct mem_section_usage) + usemap_size(); - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), - size * usemap_count); - if (!usemap) { + usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + size * usage_count); + if (!usage) { pr_warn("%s: allocation failed\n", __func__); return; } + memset(usage, 0, size * usage_count); for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = usemap; - usemap += size; - check_usemap_section_nr(nodeid, usemap_map[pnum]); + usage_map[pnum] = usage; + usage += size; + check_usemap_section_nr(nodeid, usage_map[pnum]); } } @@ -468,7 +475,7 @@ void __weak __meminit vmemmap_populate_print_last(void) /** * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap - * @map: usemap_map for pageblock flags or mmap_map for vmemmap + * @map: usage_map for mem_section_usage or mmap_map for vmemmap */ static void __init alloc_usemap_and_memmap(void (*alloc_func) (void *, unsigned long, unsigned long, @@ -521,10 +528,9 @@ static void __init alloc_usemap_and_memmap(void 
(*alloc_func) */ void __init sparse_init(void) { + struct mem_section_usage *usage, **usage_map; unsigned long pnum; struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; int size; #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER int size2; @@ -539,21 +545,21 @@ void __init sparse_init(void) /* * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) + * usage is less one page (aka 24 bytes) * so alloc 2M (with 2M align) and 24 bytes in turn will * make next 2M slip to one more 2M later. * then in big system, the memory will have a lot of holes... * here try to allocate 2M pages continuously. * * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. + * sparse_early_mem_map_alloc, so allocate usage_map at first. */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = memblock_virt_alloc(size, 0); - if (!usemap_map) - panic("can not allocate usemap_map\n"); + size = sizeof(struct mem_section_usage *) * NR_MEM_SECTIONS; + usage_map = memblock_virt_alloc(size, 0); + if (!usage_map) + panic("can not allocate usage_map\n"); alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, - (void *)usemap_map); + (void *)usage_map); #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER size2 = sizeof(struct page *) * NR_MEM_SECTIONS; @@ -568,8 +574,8 @@ void __init sparse_init(void) if (!present_section_nr(pnum)) continue; - usemap = usemap_map[pnum]; - if (!usemap) + usage = usage_map[pnum]; + if (!usage) continue; #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER @@ -581,7 +587,7 @@ void __init sparse_init(void) continue; sparse_init_one_section(__nr_to_section(pnum), pnum, map, - usemap); + usage); } vmemmap_populate_print_last(); @@ -589,7 +595,7 @@ void __init sparse_init(void) #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER memblock_free_early(__pa(map_map), size2); #endif - memblock_free_early(__pa(usemap_map), size); + memblock_free_early(__pa(usage_map), size); } #ifdef CONFIG_MEMORY_HOTPLUG @@ -693,9 +699,9 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; + static struct mem_section_usage *usage; struct mem_section *ms; struct page *memmap; - unsigned long *usemap; unsigned long flags; int ret; @@ -709,8 +715,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); if (!memmap) return -ENOMEM; - usemap = __kmalloc_section_usemap(); - if (!usemap) { + usage = __alloc_section_usage(); + if (!usage) { __kfree_section_memmap(memmap); return -ENOMEM; } @@ -727,12 +733,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) ms->section_mem_map |= SECTION_MARKED_PRESENT; - ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + ret = sparse_init_one_section(ms, section_nr, memmap, usage); out: pgdat_resize_unlock(pgdat, &flags); if (ret <= 0) { - kfree(usemap); + kfree(usage); __kfree_section_memmap(memmap); } return ret; @@ -760,19 +766,20 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usemap(struct page *memmap, unsigned long *usemap) +static void free_section_usage(struct page *memmap, + struct mem_section_usage *usage) { struct page *usemap_page; - if (!usemap) + if (!usage) return; - usemap_page = virt_to_page(usemap); + usemap_page = 
virt_to_page(usage->pageblock_flags); /* * Check to see if allocation came from hot-plug-add */ if (PageSlab(usemap_page) || PageCompound(usemap_page)) { - kfree(usemap); + kfree(usage); if (memmap) __kfree_section_memmap(memmap); return; @@ -790,23 +797,24 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset) { + unsigned long flags; struct page *memmap = NULL; - unsigned long *usemap = NULL, flags; + struct mem_section_usage *usage = NULL; struct pglist_data *pgdat = zone->zone_pgdat; pgdat_resize_lock(pgdat, &flags); if (ms->section_mem_map) { - usemap = ms->pageblock_flags; + usage = ms->usage; memmap = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); ms->section_mem_map = 0; - ms->pageblock_flags = NULL; + ms->usage = NULL; } pgdat_resize_unlock(pgdat, &flags); clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usemap(memmap, usemap); + free_section_usage(memmap, usage); } #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From 86986cd785bec7f840e763487c4dc1bf614002d4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:02 +1100 Subject: mm: introduce common definitions for the size and mask of a section Up-level the local section size and mask from kernel/memremap.c to global definitions. These will be used by the new sub-section hotplug support. Link: http://lkml.kernel.org/r/148486362333.19694.14241079169560958896.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ kernel/memremap.c | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5b5a6dc2df05..e772509af56c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1052,6 +1052,8 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) +#define PA_SECTION_MASK (~(PA_SECTION_SIZE-1)) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) diff --git a/kernel/memremap.c b/kernel/memremap.c index e0a2de6a168b..e58d7828ff9a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -172,8 +172,6 @@ EXPORT_SYMBOL(devm_memunmap); #ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) struct page_map { struct resource res; @@ -267,8 +265,8 @@ static void devm_memremap_pages_release(struct device *dev, void *data) } /* pages are dead and unused, undo the arch mapping */ - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_start = res->start & PA_SECTION_MASK; + align_size = ALIGN(resource_size(res), PA_SECTION_SIZE); mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); @@ -314,8 +312,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct page_map *page_map; int error, nid, is_ram; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = 
ALIGN(res->start + resource_size(res), SECTION_SIZE) + align_start = res->start & PA_SECTION_MASK; + align_size = ALIGN(res->start + resource_size(res), PA_SECTION_SIZE) - align_start; is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); -- cgit v1.2.3 From 310e49cc769903927212ef49cfde709eace8aaf0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:02 +1100 Subject: mm: track active portions of a section at boot Prepare for hot{plug,remove} of sub-ranges of a section by tracking a section active bitmask, each bit representing 2MB (SECTION_SIZE (128M) / map_active bitmask length (64)). Link: http://lkml.kernel.org/r/148486363375.19694.14661926204436340901.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ mm/page_alloc.c | 4 +++- mm/sparse.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e772509af56c..5c2eaa5a5d53 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1087,6 +1087,8 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; +void section_active_init(unsigned long pfn, unsigned long nr_pages); + struct page; struct page_ext; struct mem_section { @@ -1228,6 +1230,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define section_active_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 372f28f0030f..2c24bbd31bc5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6454,10 +6454,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early node map */ pr_info("Early memory node ranges\n"); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + section_active_init(start_pfn, end_pfn - start_pfn); + } /* Initialise every node */ mminit_verify_pageflags_layout(); diff --git a/mm/sparse.c b/mm/sparse.c index 886f666ebe35..2265578eedbb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,59 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +static int section_active_index(phys_addr_t phys) +{ + return (phys & ~(PA_SECTION_MASK)) / SECTION_ACTIVE_SIZE; +} + +static unsigned long section_active_mask(unsigned long pfn, + unsigned long nr_pages) +{ + int idx_start, idx_size; + phys_addr_t start, size; + + if (!nr_pages) + return 0; + + start = PFN_PHYS(pfn); + size = PFN_PHYS(min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK))); + size = ALIGN(size, SECTION_ACTIVE_SIZE); + + idx_start = section_active_index(start); + idx_size = section_active_index(size); + + if (idx_size == 0) + return -1; + return ((1UL << idx_size) - 1) << idx_start; +} + +void section_active_init(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + int i, start_sec = pfn_to_section_nr(pfn); + + if (!nr_pages) + return; + + for (i = start_sec; i <= end_sec; i++) { + struct mem_section *ms; + unsigned long mask; + unsigned long 
pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + mask = section_active_mask(pfn, pfns); + + ms = __nr_to_section(i); + pr_debug("%s: sec: %d mask: %#018lx\n", __func__, i, mask); + ms->usage->map_active = mask; + + pfn += pfns; + nr_pages -= pfns; + } +} + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 1b2d0af7f2127e2b682c6ab56752c33654deedd5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 8 Feb 2017 10:27:03 +1100 Subject: mm-track-active-portions-of-a-section-at-boot-fix make section_active_init() init, use static inline for stub Cc: Dan Williams Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Michal Hocko Cc: Stephen Bates Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 +++++- mm/sparse.c | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c2eaa5a5d53..df992831fde7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1087,7 +1087,7 @@ struct mem_section_usage { unsigned long pageblock_flags[0]; }; -void section_active_init(unsigned long pfn, unsigned long nr_pages); +void __init section_active_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; @@ -1230,6 +1230,10 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +static inline void section_active_init(unsigned long pfn, + unsigned long nr_pages) +{ +} #define section_active_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ diff --git a/mm/sparse.c b/mm/sparse.c index 2265578eedbb..2c1d962f00e3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,13 +168,13 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } -static int section_active_index(phys_addr_t phys) +static int __init section_active_index(phys_addr_t phys) { return (phys & ~(PA_SECTION_MASK)) / SECTION_ACTIVE_SIZE; } -static unsigned long section_active_mask(unsigned long pfn, - unsigned long nr_pages) +static unsigned long __init section_active_mask(unsigned long pfn, + unsigned long nr_pages) { int idx_start, idx_size; phys_addr_t start, size; @@ -195,7 +195,7 @@ static unsigned long section_active_mask(unsigned long pfn, return ((1UL << idx_size) - 1) << idx_start; } -void section_active_init(unsigned long pfn, unsigned long nr_pages) +void __init section_active_init(unsigned long pfn, unsigned long nr_pages) { int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); int i, start_sec = pfn_to_section_nr(pfn); -- cgit v1.2.3 From 835b9624109a97628fad1633b6880730f170d36f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:04 +1100 Subject: mm: fix register_new_memory() zone type detection In preparation for sub-section memory hotplug support, remove a dependency on ->section_mem_map being populated. In SPARSEMEM_VMEMMAP=y configurations pfn_to_page() does not use ->section_mem_map. The sub-section hotplug support relies on this fact and skips initializing it. Without ->section_mem_map populated, or aligned to section boundary, conversions of mem_section instances to zones is not possible. So, this removes a false dependency on a structure field that will only be valid in the SPARSEMEM_VMEMMAP=n case, and only used for pfn_to_page() (and similar) operations. 
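To make the SPARSEMEM_VMEMMAP=y point above concrete, here is an illustrative sketch (not part of the patch) of how pfn_to_page() is resolved in that configuration, roughly as in include/asm-generic/memory_model.h: it is plain pointer arithmetic against the virtually mapped memmap and never dereferences ->section_mem_map.

#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)

Since no lookup through the mem_section is needed, register_new_memory() can take the zone directly and test is_dev_zone() instead of decoding a possibly uninitialized ->section_mem_map.
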
Link: http://lkml.kernel.org/r/148486363923.19694.16803321290568634521.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- drivers/base/memory.c | 26 +++++++++----------------- include/linux/memory.h | 4 ++-- mm/memory_hotplug.c | 4 ++-- 3 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fa26ffd25fa6..fb381464e9cb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -685,24 +685,16 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. */ -int register_new_memory(int nid, struct mem_section *section) +int register_new_memory(struct zone *zone, int nid, struct mem_section *section) { int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) + if (is_dev_zone(zone)) return 0; mutex_lock(&mem_sysfs_mutex); @@ -736,14 +728,11 @@ unregister_memory(struct memory_block *memory) device_unregister(&memory->dev); } -static int remove_memory_section(unsigned long node_id, - struct mem_section *section, int phys_device) +static int remove_memory_section(struct zone *zone, unsigned long node_id, + struct mem_section *section, int phys_device) { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); unregister_mem_sect_under_nodes(mem, __section_nr(section)); @@ -758,12 +747,15 @@ static int remove_memory_section(unsigned long node_id, return 0; } -int unregister_memory_section(struct mem_section *section) +int unregister_memory_section(struct zone *zone, struct mem_section *section) { + if (is_dev_zone(zone)) + return 0; + if (!present_section(section)) return -EINVAL; - return remove_memory_section(0, section, 0); + return remove_memory_section(zone, 0, section, 0); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/include/linux/memory.h b/include/linux/memory.h index 093607f90b91..301dfb03ecb7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -108,12 +108,12 @@ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); -extern int register_new_memory(int, struct mem_section *); +extern int register_new_memory(struct zone *, int, struct mem_section *); extern int memory_block_change_state(struct memory_block *mem, unsigned long to_state, unsigned long from_state_req); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int unregister_memory_section(struct mem_section *); +extern int unregister_memory_section(struct zone *, struct mem_section *); #endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 29f7a7ce7a34..85e158b8604d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -517,7 +517,7 @@ static int __meminit __add_section(int nid, struct zone *zone, if (ret < 0) return ret; - return register_new_memory(nid, 
__pfn_to_section(phys_start_pfn)); + return register_new_memory(zone, nid, __pfn_to_section(phys_start_pfn)); } /* @@ -784,7 +784,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, if (!valid_section(ms)) return ret; - ret = unregister_memory_section(ms); + ret = unregister_memory_section(zone, ms); if (ret) return ret; -- cgit v1.2.3 From 2eaeb90871fc43a24d95d0822de463a34092d3be Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:04 +1100 Subject: mm: convert kmalloc_section_memmap() to populate_section_memmap() Allow sub-section sized ranges to be added to the memmap. populate_section_memmap() takes an explict pfn range rather than assuming a full section, and those parameters are plumbed all the way through to vmmemap_populate(). There should be no sub-section in current code. New warnings are added to clarify which memmap allocation paths are sub-section capable. Link: http://lkml.kernel.org/r/148486364441.19694.15498867968502173386.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 4 +++- include/linux/mm.h | 3 ++- mm/sparse-vmemmap.c | 24 +++++++++++++++------ mm/sparse.c | 60 +++++++++++++++++++++++++++++++++------------------ 4 files changed, 61 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 97346f987ef2..12954009c9eb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1220,7 +1220,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (boot_cpu_has(X86_FEATURE_PSE)) + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) + err = vmemmap_populate_basepages(start, end, node); + else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", diff --git a/include/linux/mm.h b/include/linux/mm.h index f317d308db3a..d8e585562cce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2345,7 +2345,8 @@ void sparse_mem_maps_populate_node(struct page **map_map, unsigned long map_count, int nodeid); -struct page *sparse_mem_map_populate(unsigned long pnum, int nid); +struct page *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 574c67b663fe..8679d4a81b98 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -248,20 +248,28 @@ int __meminit vmemmap_populate_basepages(unsigned long start, return 0; } -struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) +struct page * __meminit __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid) { unsigned long start; unsigned long end; - struct page *map; - map = pfn_to_page(pnum * PAGES_PER_SECTION); - start = (unsigned long)map; - end = (unsigned long)(map + PAGES_PER_SECTION); + /* + * The minimum granularity of memmap extensions is + * SECTION_ACTIVE_SIZE as allocations are tracked in the + * 'map_active' bitmap of the section. 
+ */ + end = ALIGN(pfn + nr_pages, PHYS_PFN(SECTION_ACTIVE_SIZE)); + pfn &= PHYS_PFN(SECTION_ACTIVE_MASK); + nr_pages = end - pfn; + + start = (unsigned long) pfn_to_page(pfn); + end = start + nr_pages * sizeof(struct page); if (vmemmap_populate(start, end, nid)) return NULL; - return map; + return pfn_to_page(pfn); } void __init sparse_mem_maps_populate_node(struct page **map_map, @@ -284,11 +292,13 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, for (pnum = pnum_begin; pnum < pnum_end; pnum++) { struct mem_section *ms; + unsigned long pfn = section_nr_to_pfn(pnum); if (!present_section_nr(pnum)) continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + map_map[pnum] = __populate_section_memmap(pfn, + PAGES_PER_SECTION, nodeid); if (map_map[pnum]) continue; ms = __nr_to_section(pnum); diff --git a/mm/sparse.c b/mm/sparse.c index c16fbaa5938f..2f571e371ec1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -423,7 +423,8 @@ static void __init sparse_early_usemaps_alloc_node(void *data, } #ifndef CONFIG_SPARSEMEM_VMEMMAP -struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) +struct page __init *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid) { struct page *map; unsigned long size; @@ -475,10 +476,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, /* fallback */ for (pnum = pnum_begin; pnum < pnum_end; pnum++) { struct mem_section *ms; + unsigned long pfn = section_nr_to_pfn(pnum); if (!present_section_nr(pnum)) continue; - map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + map_map[pnum] = __populate_section_memmap(pfn, + PAGES_PER_SECTION, nodeid); if (map_map[pnum]) continue; ms = __nr_to_section(pnum); @@ -506,7 +509,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) struct mem_section *ms = __nr_to_section(pnum); int nid = sparse_early_nid(ms); - map = sparse_mem_map_populate(pnum, nid); + map = __populate_section_memmap(section_nr_to_pfn(pnum), + PAGES_PER_SECTION, nid); if (map) return map; @@ -648,15 +652,16 @@ void __init sparse_init(void) #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +static struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid) { - /* This will make the necessary allocations eventually. 
*/ - return sparse_mem_map_populate(pnum, nid); + return __populate_section_memmap(pfn, nr_pages, nid); } -static void __kfree_section_memmap(struct page *memmap) + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages) { - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); vmemmap_free(start, end); } @@ -670,11 +675,18 @@ static void free_map_bootmem(struct page *memmap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(void) +struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid) { struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; + if ((pfn & ~PAGE_SECTION_MASK) || nr_pages != PAGES_PER_SECTION) { + WARN(1, "%s: called with section unaligned parameters\n", + __func__); + return NULL; + } + page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) goto got_map_page; @@ -691,13 +703,16 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages) { - return __kmalloc_section_memmap(); -} + struct page *memmap = pfn_to_page(pfn); + + if ((pfn & ~PAGE_SECTION_MASK) || nr_pages != PAGES_PER_SECTION) { + WARN(1, "%s: called with section unaligned parameters\n", + __func__); + return; + } -static void __kfree_section_memmap(struct page *memmap) -{ if (is_vmalloc_addr(memmap)) vfree(memmap); else @@ -755,12 +770,13 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); + memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, + pgdat->node_id); if (!memmap) return -ENOMEM; usage = __alloc_section_usage(); if (!usage) { - __kfree_section_memmap(memmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION); return -ENOMEM; } @@ -782,7 +798,7 @@ out: pgdat_resize_unlock(pgdat, &flags); if (ret < 0 && ret != -EEXIST) { kfree(usage); - __kfree_section_memmap(memmap); + depopulate_section_memmap(start_pfn, PAGES_PER_SECTION); return ret; } return 0; @@ -811,7 +827,8 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) #endif static void free_section_usage(struct page *memmap, - struct mem_section_usage *usage) + struct mem_section_usage *usage, unsigned long pfn, + unsigned long nr_pages) { struct page *usemap_page; @@ -825,7 +842,7 @@ static void free_section_usage(struct page *memmap, if (PageSlab(usemap_page) || PageCompound(usemap_page)) { kfree(usage); if (memmap) - __kfree_section_memmap(memmap); + depopulate_section_memmap(pfn, nr_pages); return; } @@ -858,7 +875,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, clear_hwpoisoned_pages(memmap + map_offset, PAGES_PER_SECTION - map_offset); - free_section_usage(memmap, usage); + free_section_usage(memmap, usage, section_nr_to_pfn(__section_nr(ms)), + PAGES_PER_SECTION); } #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From b6e247c22c883134f63ab0a021d9c4a1f03294e2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Feb 2017 10:27:05 +1100 Subject: mm: prepare for hot-{add, remove} 
of sub-section ranges Prepare the memory hot-{add,remove} paths for handling sub-section ranges by plumbing the starting page frame and number of pages being handled through arch_{add,remove}_memory() to sparse_{add,remove}_one_section(). This is simply plumbing, small cleanups, and some identifier renames. No intended functional changes. Link: http://lkml.kernel.org/r/148486364991.19694.14012559642668647057.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Cc: Logan Gunthorpe Cc: Mel Gorman Cc: Stephen Bates Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 11 ++++++ include/linux/memory_hotplug.h | 6 ++- mm/memory_hotplug.c | 85 +++++++++++++++++++++++------------------- mm/sparse.c | 6 ++- 4 files changed, 65 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 12954009c9eb..5fff9138c442 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -650,6 +650,17 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; + /* + * Only allow partial section hotplug for ZONE_DEVICE ranges, + * since register_new_memory() requires section alignment, and + * CONFIG_SPARSEMEM_VMEMMAP=n requires sections to be fully + * populated. + */ + if ((!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) || !for_device) + && ((start & ~PA_SECTION_MASK) + || (size & ~PA_SECTION_MASK))) + return -EINVAL; + init_memory_mapping(start, start + size); ret = __add_pages(nid, zone, start_pfn, nr_pages); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..2f2c0d1290a1 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -280,8 +280,10 @@ extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +extern int sparse_add_section(struct zone *zone, unsigned long pfn, + unsigned long nr_pages); +extern void sparse_remove_section(struct zone *zone, struct mem_section *ms, + unsigned long pfn, unsigned long nr_pages, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 85e158b8604d..8fcde9107f02 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -467,10 +467,10 @@ static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long s pgdat->node_start_pfn; } -static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; int nid = pgdat->node_id; int zone_type; unsigned long flags, pfn; @@ -500,24 +500,21 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) } static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) + unsigned long pfn, unsigned long nr_pages) { int ret; - if (pfn_valid(phys_start_pfn)) - return -EEXIST; - - ret = sparse_add_one_section(zone, 
phys_start_pfn); + ret = sparse_add_section(zone, pfn, nr_pages); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + ret = __add_zone(zone, pfn, nr_pages); if (ret < 0) return ret; - return register_new_memory(zone, nid, __pfn_to_section(phys_start_pfn)); + return register_new_memory(zone, nid, __pfn_to_section(pfn)); } /* @@ -526,7 +523,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, struct zone *zone, unsigned long pfn, unsigned long nr_pages) { int err = 0, i, start_sec, end_sec; @@ -534,16 +531,12 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - - altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(pfn)); if (altmap) { /* * Validate altmap is within bounds of the total request */ - if (altmap->base_pfn != phys_start_pfn + if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); err = -EINVAL; @@ -552,8 +545,16 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, altmap->alloc = 0; } + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + err = __add_section(nid, zone, pfn, pfns); + pfn += pfns; + nr_pages -= pfns; /* * EEXIST is finally dealt with by ioresource collision @@ -759,10 +760,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +static void __remove_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; int zone_type; unsigned long flags; @@ -774,11 +775,10 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset) +static int __remove_section(struct zone *zone, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset) { - unsigned long start_pfn; - int scn_nr; + struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); int ret = -EINVAL; if (!valid_section(ms)) @@ -788,11 +788,9 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, if (ret) return ret; - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn(scn_nr); - __remove_zone(zone, start_pfn); + __remove_zone(zone, pfn, nr_pages); - sparse_remove_one_section(zone, ms, map_offset); + sparse_remove_section(zone, ms, pfn, nr_pages, map_offset); return 0; } @@ -807,16 +805,15 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). 
*/ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +int __remove_pages(struct zone *zone, unsigned long pfn, unsigned long nr_pages) { - unsigned long i; unsigned long map_offset = 0; - int sections_to_remove, ret = 0; + int i, start_sec, end_sec, ret = 0; /* In the ZONE_DEVICE case device driver owns the memory region */ if (is_dev_zone(zone)) { - struct page *page = pfn_to_page(phys_start_pfn); + struct page *page = pfn_to_page(pfn); struct vmem_altmap *altmap; altmap = to_vmem_altmap((unsigned long) page); @@ -825,7 +822,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } else { resource_size_t start, size; - start = phys_start_pfn << PAGE_SHIFT; + start = pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; ret = release_mem_region_adjustable(&iomem_resource, start, @@ -841,16 +838,26 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, clear_zone_contiguous(zone); /* - * We can only remove entire sections + * Only ZONE_DEVICE memory is enabled to remove + * section-unaligned ranges. See register_new_memory() which + * assumes section alignment and is skipped for ZONE_DEVICE + * ranges. */ - BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); - BUG_ON(nr_pages % PAGES_PER_SECTION); + if (!is_dev_zone(zone) && ((pfn | nr_pages) & ~PAGE_SECTION_MASK)) { + WARN(1, "section unaligned removal not supported\n"); + return -EINVAL; + } - sections_to_remove = nr_pages / PAGES_PER_SECTION; - for (i = 0; i < sections_to_remove; i++) { - unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + for (i = start_sec; i <= end_sec; i++) { + unsigned long pfns; - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ret = __remove_section(zone, pfn, pfns, map_offset); + pfn += pfns; + nr_pages -= pfns; map_offset = 0; if (ret) break; diff --git a/mm/sparse.c b/mm/sparse.c index 2f571e371ec1..ba5b30d6e076 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -753,7 +753,8 @@ static void free_map_bootmem(struct page *memmap) #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_section(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; @@ -855,7 +856,8 @@ static void free_section_usage(struct page *memmap, free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, +void sparse_remove_section(struct zone *zone, struct mem_section *ms, + unsigned long pfn, unsigned long nr_pages, unsigned long map_offset) { unsigned long flags; -- cgit v1.2.3 From 81a10d016a21bcbe6d38374890d4f07ee023ce62 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 8 Feb 2017 10:27:08 +1100 Subject: memblock: embed memblock type name within struct memblock_type Provide the name of each memblock type with struct memblock_type. This allows to get rid of the function memblock_type_name() and duplicating the type names in __memblock_dump_all(). The only memblock_type usage out of mm/memblock.c seems to be arch/s390/kernel/crash_dump.c. While at it, give it a name. 
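As an illustration of the new convention, a statically defined type now carries its own label and memblock_dump() picks it up straight from the structure (demo_region/demo_type are made-up names, not taken from the patch):

static struct memblock_region demo_region;
static struct memblock_type demo_type = {
	.regions	= &demo_region,
	.cnt		= 1,	/* empty dummy entry */
	.max		= 1,
	.total_size	= 0,
	.name		= "demo",
};

This mirrors the oldmem_type hunk in arch/s390/kernel/crash_dump.c below.
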
Link: http://lkml.kernel.org/r/20170120123456.46508-4-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Cc: Philipp Hachtmann Signed-off-by: Andrew Morton --- arch/s390/kernel/crash_dump.c | 1 + include/linux/memblock.h | 1 + mm/memblock.c | 35 +++++++++++------------------------ 3 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index f9293bfefb7f..9c9440dc253a 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -31,6 +31,7 @@ static struct memblock_type oldmem_type = { .max = 1, .total_size = 0, .regions = &oldmem_region, + .name = "oldmem", }; struct save_area { diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 38bcf00cbed3..bdfc65af4152 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,7 @@ struct memblock_type { unsigned long max; /* size of the allocated array */ phys_addr_t total_size; /* size of all regions */ struct memblock_region *regions; + char *name; }; struct memblock { diff --git a/mm/memblock.c b/mm/memblock.c index 647d79ed07a3..b64b47803e52 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP .physmem.regions = memblock_physmem_init_regions, .physmem.cnt = 1, /* empty dummy entry */ .physmem.max = INIT_PHYSMEM_REGIONS, + .physmem.name = "physmem", #endif .bottom_up = false, @@ -64,22 +67,6 @@ ulong __init_memblock choose_memblock_flags(void) return system_has_some_mirror ? 
MEMBLOCK_MIRROR : MEMBLOCK_NONE; } -/* inline so we don't get a warning when pr_debug is compiled out */ -static __init_memblock const char * -memblock_type_name(struct memblock_type *type) -{ - if (type == &memblock.memory) - return "memory"; - else if (type == &memblock.reserved) - return "reserved"; -#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - else if (type == &memblock.physmem) - return "physmem"; -#endif - else - return "unknown"; -} - /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { @@ -406,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", - memblock_type_name(type), type->max, type->max * 2); + type->name, type->max, type->max * 2); return -1; } memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, + type->name, type->max * 2, (u64)addr, (u64)addr + new_size - 1); /* @@ -1697,14 +1684,14 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) return memblock.current_limit; } -static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; - pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; @@ -1719,7 +1706,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", - name, idx, &base, &end, &size, nid_buf, flags); + type->name, idx, &base, &end, &size, nid_buf, flags); } } @@ -1730,10 +1717,10 @@ void __init_memblock __memblock_dump_all(void) &memblock.memory.total_size, &memblock.reserved.total_size); - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + memblock_dump(&memblock.memory); + memblock_dump(&memblock.reserved); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - memblock_dump(&memblock.physmem, "physmem"); + memblock_dump(&memblock.physmem); #endif } -- cgit v1.2.3 From 84b8d1a5877f0051ed63ad72fdc7427dacc336fe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:27:09 +1100 Subject: userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to *EVENT_REMOVE Patch series "userfaultfd: non-cooperative: add madvise() event for MADV_REMOVE request". These patches add notification of madvise(MADV_REMOVE) event to non-cooperative userfaultfd monitor. The first pacth renames EVENT_MADVDONTNEED to EVENT_REMOVE along with relevant functions and structures. Using _REMOVE instead of _MADVDONTNEED describes the event semantics more clearly and I hope it's not too late for such change in the ABI. This patch (of 3): The UFFD_EVENT_MADVDONTNEED purpose is to notify uffd monitor about removal of certain range from address space tracked by userfaultfd. Hence, UFFD_EVENT_REMOVE seems to better reflect the operation semantics. Respectively, 'madv_dn' field of uffd_msg is renamed to 'remove' and the madvise_userfault_dontneed callback is renamed to userfaultfd_remove. 
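For illustration, a minimal sketch of how a non-cooperative monitor would consume the renamed event; it assumes <unistd.h> plus the updated <linux/userfaultfd.h>, error handling is omitted, and handle_remove() is a hypothetical callback rather than anything added by this patch. The event constant and message layout are the ones defined in the uapi header below.

static void demo_read_uffd_event(int uffd)
{
	struct uffd_msg msg;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return;
	if (msg.event == UFFD_EVENT_REMOVE)
		/* [start, end) was removed from the registered range */
		handle_remove(msg.arg.remove.start, msg.arg.remove.end);
}
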
Link: http://lkml.kernel.org/r/1484814154-1557-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Acked-by: Hillf Danton Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 14 +++++++------- include/linux/userfaultfd_k.h | 16 ++++++++-------- include/uapi/linux/userfaultfd.h | 8 ++++---- mm/madvise.c | 2 +- tools/testing/selftests/vm/userfaultfd.c | 16 ++++++++-------- 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 18406158e13f..8fe601b4875e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -681,16 +681,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } -void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return; userfaultfd_ctx_get(ctx); @@ -700,9 +700,9 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, msg_init(&ewq.msg); - ewq.msg.event = UFFD_EVENT_MADVDONTNEED; - ewq.msg.arg.madv_dn.start = start; - ewq.msg.arg.madv_dn.end = end; + ewq.msg.event = UFFD_EVENT_REMOVE; + ewq.msg.arg.remove.start = start; + ewq.msg.arg.remove.end = end; userfaultfd_event_wait_completion(ctx, &ewq); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f431861f22f1..2521542f6c07 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,10 +61,10 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); -extern void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end); +extern void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); #else /* CONFIG_USERFAULTFD */ @@ -112,10 +112,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, { } -static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end) +static inline void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) { } #endif /* CONFIG_USERFAULTFD */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9ac4b68c54d1..b742c40c2880 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -20,7 +20,7 @@ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -92,7 +92,7 @@ struct uffd_msg { struct { __u64 start; __u64 end; - } madv_dn; + } remove; struct { /* unused reserved fields */ @@ -109,7 +109,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 -#define 
UFFD_EVENT_MADVDONTNEED 0x15 +#define UFFD_EVENT_REMOVE 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -155,7 +155,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) -#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; diff --git a/mm/madvise.c b/mm/madvise.c index b530a4986035..ab5ef141cc9b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -479,7 +479,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (!can_madv_dontneed_vma(vma)) return -EINVAL; - madvise_userfault_dontneed(vma, prev, start, end); + userfaultfd_remove(vma, prev, start, end); zap_page_range(vma, start, end - start); return 0; } diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 5a840a605a16..9eb77df568f7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -398,12 +398,12 @@ static void *uffd_poll_thread(void *arg) uffd = msg.arg.fork.ufd; pollfd[0].fd = uffd; break; - case UFFD_EVENT_MADVDONTNEED: - uffd_reg.range.start = msg.arg.madv_dn.start; - uffd_reg.range.len = msg.arg.madv_dn.end - - msg.arg.madv_dn.start; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - fprintf(stderr, "madv_dn failure\n"), exit(1); + fprintf(stderr, "remove failure\n"), exit(1); break; case UFFD_EVENT_REMAP: area_dst = (char *)(unsigned long)msg.arg.remap.to; @@ -570,7 +570,7 @@ static int userfaultfd_open(int features) * mremap, the entire monitored area is accessed in a single pass for * HUGETLB_TEST. * The release of the pages currently generates event only for - * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked * for hugetlb and shmem. */ static int faulting_process(void) @@ -715,14 +715,14 @@ static int userfaultfd_events_test(void) pid_t pid; char c; - printf("testing events (fork, remap, madv_dn): "); + printf("testing events (fork, remap, remove): "); fflush(stdout); if (release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_MADVDONTNEED; + UFFD_FEATURE_EVENT_REMOVE; if (userfaultfd_open(features) < 0) return 1; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); -- cgit v1.2.3 From 2b837027cdc60ac391600a8b175dce3b9f1e6adf Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 8 Feb 2017 10:27:10 +1100 Subject: mm: vmscan: scan dirty pages even in laptop mode Patch series "mm: vmscan: fix kswapd writeback regression". We noticed a regression on multiple hadoop workloads when moving from 3.10 to 4.0 and 4.6, which involves kswapd getting tangled up in page writeout, causing direct reclaim herds that also don't make progress. I tracked it down to the thrash avoidance efforts after 3.10 that make the kernel better at keeping use-once cache and use-many cache sorted on the inactive and active list, with more aggressive protection of the active list as long as there is inactive cache. Unfortunately, our workload's use-once cache is mostly from streaming writes. 
Waiting for writes to avoid potential reloads in the future is not a good tradeoff. These patches do the following: 1. Wake the flushers when kswapd sees a lump of dirty pages. It's possible to be below the dirty background limit and still have cache velocity push them through the LRU. So start a-flushin'. 2. Let kswapd only write pages that have been rotated twice. This makes sure we really tried to get all the clean pages on the inactive list before resorting to horrible LRU-order writeback. 3. Move rotating dirty pages off the inactive list. Instead of churning or waiting on page writeback, we'll go after clean active cache. This might lead to thrashing, but in this state memory demand outstrips IO speed anyway, and reads are faster than writes. Mel backported the series to 4.10-rc5 with one minor conflict and ran a couple of tests on it. Mix of read/write random workload didn't show anything interesting. Write-only database didn't show much difference in performance but there were slight reductions in IO -- probably in the noise. simoop did show big differences although not as big as Mel expected. This is Chris Mason's workload that similate the VM activity of hadoop. Mel won't go through the full details but over the samples measured during an hour it reported 4.10.0-rc5 4.10.0-rc5 vanilla johannes-v1r1 Amean p50-Read 21346531.56 ( 0.00%) 21697513.24 ( -1.64%) Amean p95-Read 24700518.40 ( 0.00%) 25743268.98 ( -4.22%) Amean p99-Read 27959842.13 ( 0.00%) 28963271.11 ( -3.59%) Amean p50-Write 1138.04 ( 0.00%) 989.82 ( 13.02%) Amean p95-Write 1106643.48 ( 0.00%) 12104.00 ( 98.91%) Amean p99-Write 1569213.22 ( 0.00%) 36343.38 ( 97.68%) Amean p50-Allocation 85159.82 ( 0.00%) 79120.70 ( 7.09%) Amean p95-Allocation 204222.58 ( 0.00%) 129018.43 ( 36.82%) Amean p99-Allocation 278070.04 ( 0.00%) 183354.43 ( 34.06%) Amean final-p50-Read 21266432.00 ( 0.00%) 21921792.00 ( -3.08%) Amean final-p95-Read 24870912.00 ( 0.00%) 26116096.00 ( -5.01%) Amean final-p99-Read 28147712.00 ( 0.00%) 29523968.00 ( -4.89%) Amean final-p50-Write 1130.00 ( 0.00%) 977.00 ( 13.54%) Amean final-p95-Write 1033216.00 ( 0.00%) 2980.00 ( 99.71%) Amean final-p99-Write 1517568.00 ( 0.00%) 32672.00 ( 97.85%) Amean final-p50-Allocation 86656.00 ( 0.00%) 78464.00 ( 9.45%) Amean final-p95-Allocation 211712.00 ( 0.00%) 116608.00 ( 44.92%) Amean final-p99-Allocation 287232.00 ( 0.00%) 168704.00 ( 41.27%) The latencies are actually completely horrific in comparison to 4.4 (and 4.10-rc5 is worse than 4.9 according to historical data for reasons Mel hasn't analysed yet). Still, 95% of write latency (p95-write) is halved by the series and allocation latency is way down. Direct reclaim activity is one fifth of what it was according to vmstats. Kswapd activity is higher but this is not necessarily surprising. Kswapd efficiency is unchanged at 99% (99% of pages scanned were reclaimed) but direct reclaim efficiency went from 77% to 99% In the vanilla kernel, 627MB of data was written back from reclaim context. With the series, no data was written back. With or without the patch, pages are being immediately reclaimed after writeback completes. However, with the patch, only 1/8th of the pages are reclaimed like this. This patch (of 5): We have an elaborate dirty/writeback throttling mechanism inside the reclaim scanner, but for that to work the pages have to go through shrink_page_list() and get counted for what they are. Otherwise, we mess up the LRU order and don't match reclaim speed to writeback. 
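In code terms, the culprit is the isolation-time shortcut that this patch deletes; a simplified sketch of the old behaviour (not the literal hunk, see the __isolate_lru_page() change below):

	if (mode & ISOLATE_CLEAN) {
		/* dirty page is skipped and never reaches shrink_page_list() */
		if (PageDirty(page))
			return ret;
	}
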
Especially during deactivation, there is never a reason to skip dirty pages; nothing is even trying to write them out from there. Don't mess up the LRU order for nothing, shuffle these pages along. Link: http://lkml.kernel.org/r/20170123181641.23938-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 14 ++------------ 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index df992831fde7..338a786a993f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -236,8 +236,6 @@ struct lruvec { #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) -/* Isolate clean file */ -#define ISOLATE_CLEAN ((__force isolate_mode_t)0x1) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 7bb23ff229b6..0d05f7f3b532 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -87,6 +87,7 @@ struct scan_control { /* The highest zone to isolate pages for reclaim from */ enum zone_type reclaim_idx; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -1373,13 +1374,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * wants to isolate pages it will be able to operate on without * blocking - clean pages for the most part. * - * ISOLATE_CLEAN means that only clean pages should be isolated. This - * is used by reclaim when it is cannot write to backing storage - * * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages * that it is possible to migrate without blocking */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) return ret; @@ -1387,10 +1385,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate @@ -1731,8 +1725,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1929,8 +1921,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From e932fd3f1772e1fbc0b90dcfb4fbc689729a48f8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 8 Feb 2017 10:27:11 +1100 Subject: mm: vmscan: kick flushers when we encounter dirty pages on the LRU Memory pressure can put dirty pages at the end of the LRU without anybody running into dirty limits. Don't start writing individual pages from kswapd while the flushers might be asleep. Unlike the old direct reclaim flusher wakeup (removed in the next patch) that flushes the number of pages just scanned, this patch wakes the flushers for all outstanding dirty pages. 
That seemed to perform better in a synthetic test that pushes dirty pages to the end of the LRU and into reclaim, because we know LRU aging outstrips writeback already, and this way we give younger dirty pages a headstart rather than wait until reclaim runs into them as well. It also means less plugging and risk of exhausting the struct request pool from reclaim. There is a concern that this will cause temporary files that used to get dirtied and truncated before writeback to now get written to disk under memory pressure. If this turns out to be a real problem, we'll have to revisit this and tame the reclaim flusher wakeups. Link: http://lkml.kernel.org/r/20170123181641.23938-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 2 +- mm/vmscan.c | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5527d910ba3d..a3c0cbd7c888 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,7 +46,7 @@ enum writeback_sync_modes { */ enum wb_reason { WB_REASON_BACKGROUND, - WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_LAPTOP_TIMER, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 2ccd9ccbf9ef..7bd8783a590f 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -31,7 +31,7 @@ #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ - EM( WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages") \ + EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ diff --git a/mm/vmscan.c b/mm/vmscan.c index 0d05f7f3b532..56ea8d24041f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1798,12 +1798,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the pgdat PGDAT_DIRTY and kswapd will start writing pages from - * reclaim context. + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end + * of the LRU without the dirty limits being breached. It can + * also happen when the proportion of dirty pages grows not + * through writes but through memory pressure reclaiming all + * the clean cache. And in some cases, the flushers simply + * cannot keep up with the allocation rate. Nudge the flusher + * threads in case they are asleep, but also allow kswapd to + * start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(0, WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); + } /* * If kswapd scans pages marked marked for immediate @@ -2787,7 +2795,7 @@ retry: writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 
0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); + WB_REASON_VMSCAN); sc->may_writepage = 1; } } while (--sc->priority >= 0); -- cgit v1.2.3 From d2ba09242505d902dcf1eea6397a1679da033368 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 8 Feb 2017 10:27:12 +1100 Subject: mm: vmscan: move dirty pages out of the way until they're flushed We noticed a performance regression when moving hadoop workloads from 3.10 kernels to 4.0 and 4.6. This is accompanied by increased pageout activity initiated by kswapd as well as frequent bursts of allocation stalls and direct reclaim scans. Even lowering the dirty ratios to the equivalent of less than 1% of memory would not eliminate the issue, suggesting that dirty pages concentrate where the scanner is looking. This can be traced back to recent efforts of thrash avoidance. Where 3.10 would not detect refaulting pages and continuously supply clean cache to the inactive list, a thrashing workload on 4.0+ will detect and activate refaulting pages right away, distilling used-once pages on the inactive list much more effectively. This is by design, and it makes sense for clean cache. But for the most part our workload's cache faults are refaults and its use-once cache is from streaming writes. We end up with most of the inactive list dirty, and we don't go after the active cache as long as we have use-once pages around. But waiting for writes to avoid reclaiming clean cache that *might* refault is a bad trade-off. Even if the refaults happen, reads are faster than writes. Before getting bogged down on writeback, reclaim should first look at *all* cache in the system, even active cache. To accomplish this, activate pages that are dirty or under writeback when they reach the end of the inactive LRU. The pages are marked for immediate reclaim, meaning they'll get moved back to the inactive LRU tail as soon as they're written back and become reclaimable. But in the meantime, by reducing the inactive list to only immediately reclaimable pages, we allow the scanner to deactivate and refill the inactive list with clean cache from the active list tail to guarantee forward progress. 
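Reduced to its essence, the new shrink_page_list() policy is the following (a simplified sketch, not the literal kernel code; the real hunks follow below):

	if (PageDirty(page) || PageWriteback(page)) {
		SetPageReclaim(page);	/* rotate to the inactive tail once written back */
		goto activate_locked;	/* keep it out of the scanner's way until then */
	}
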
Link: http://lkml.kernel.org/r/20170123181641.23938-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 7 +++++++ mm/swap.c | 9 +++++---- mm/vmscan.c | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 41d376e7116d..e030a68ead7e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -50,6 +50,13 @@ static __always_inline void add_page_to_lru_list(struct page *page, list_add(&page->lru, &lruvec->lists[lru]); } +static __always_inline void add_page_to_lru_list_tail(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); +} + static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/swap.c b/mm/swap.c index aabf2e90fe32..c4910f14f957 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, { int *pgmoved = arg; - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &lruvec->lists[lru]); + if (PageLRU(page) && !PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec, page_lru(page)); + ClearPageActive(page); + add_page_to_lru_list_tail(page, lruvec, page_lru(page)); (*pgmoved)++; } } @@ -235,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec) */ void rotate_reclaimable_page(struct page *page) { - if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + if (!PageLocked(page) && !PageDirty(page) && !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; diff --git a/mm/vmscan.c b/mm/vmscan.c index 92e56cadceae..70103f411247 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1063,7 +1063,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; - goto keep_locked; + goto activate_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || @@ -1081,7 +1081,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; + goto activate_locked; /* Case 3 above */ } else { @@ -1174,7 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); - goto keep_locked; + goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) -- cgit v1.2.3 From a8fadab82e581099d6e3299447b76f7760bf5b1b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:27:16 +1100 Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf ->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to take a vma and vmf parameter when the vma already resides in vmf. Remove the vma parameter to simplify things. Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Reviewed-by: Ross Zwisler Cc: Theodore Ts'o Cc: Darrick J. 
Wong Cc: Matthew Wilcox Cc: Dave Hansen Cc: Christoph Hellwig Cc: Jan Kara Cc: Dan Williams Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_64_vio.c | 4 +-- arch/powerpc/platforms/cell/spufs/file.c | 39 ++++++++++++------------ drivers/android/binder.c | 2 +- drivers/char/agp/alpha-agp.c | 5 ++- drivers/char/mspec.c | 6 ++-- drivers/dax/dax.c | 12 ++++---- drivers/gpu/drm/drm_vm.c | 32 ++++++++++--------- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.h | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +- drivers/gpu/drm/gma500/gem.c | 3 +- drivers/gpu/drm/gma500/psb_drv.h | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 4 +-- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/gpu/drm/msm/msm_gem.c | 3 +- drivers/gpu/drm/omapdrm/omap_gem.c | 4 +-- drivers/gpu/drm/qxl/qxl_ttm.c | 6 ++-- drivers/gpu/drm/radeon/radeon_ttm.c | 6 ++-- drivers/gpu/drm/tegra/gem.c | 3 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 ++--- drivers/gpu/drm/udl/udl_drv.h | 2 +- drivers/gpu/drm/udl/udl_gem.c | 3 +- drivers/gpu/drm/vgem/vgem_drv.c | 3 +- drivers/gpu/drm/virtio/virtgpu_ttm.c | 7 ++--- drivers/hwtracing/intel_th/msu.c | 6 ++-- drivers/infiniband/hw/hfi1/file_ops.c | 4 +-- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 3 +- drivers/misc/cxl/context.c | 3 +- drivers/misc/sgi-gru/grumain.c | 3 +- drivers/misc/sgi-gru/grutables.h | 2 +- drivers/scsi/cxlflash/superpipe.c | 6 ++-- drivers/scsi/sg.c | 3 +- drivers/staging/android/ion/ion.c | 6 ++-- drivers/staging/lustre/lustre/llite/llite_mmap.c | 7 +++-- drivers/staging/lustre/lustre/llite/vvp_io.c | 2 +- drivers/target/target_core_user.c | 6 ++-- drivers/uio/uio.c | 6 ++-- drivers/usb/mon/mon_bin.c | 4 +-- drivers/video/fbdev/core/fb_defio.c | 16 +++++----- drivers/xen/privcmd.c | 4 +-- fs/9p/vfs_file.c | 4 +-- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 6 ++-- fs/ceph/addr.c | 8 +++-- fs/cifs/file.c | 2 +- fs/dax.c | 15 ++++----- fs/ext2/file.c | 17 +++++------ fs/ext4/ext4.h | 4 +-- fs/ext4/file.c | 17 +++++------ fs/ext4/inode.c | 9 +++--- fs/f2fs/file.c | 7 ++--- fs/fuse/file.c | 6 ++-- fs/gfs2/file.c | 8 ++--- fs/iomap.c | 5 ++- fs/kernfs/file.c | 13 ++++---- fs/ncpfs/mmap.c | 7 ++--- fs/nfs/file.c | 4 +-- fs/nilfs2/file.c | 3 +- fs/ocfs2/mmap.c | 15 ++++----- fs/proc/vmcore.c | 4 +-- fs/ubifs/file.c | 5 ++- fs/xfs/xfs_file.c | 25 +++++++-------- include/linux/dax.h | 5 ++- include/linux/iomap.h | 3 +- include/linux/mm.h | 10 +++--- ipc/shm.c | 6 ++-- kernel/events/core.c | 6 ++-- kernel/relay.c | 4 +-- mm/filemap.c | 19 ++++++------ mm/hugetlb.c | 2 +- mm/memory.c | 6 ++-- mm/mmap.c | 9 +++--- mm/nommu.c | 2 +- mm/shmem.c | 3 +- security/selinux/selinuxfs.c | 5 ++- sound/core/pcm_native.c | 15 ++++----- sound/usb/usx2y/us122l.c | 5 ++- sound/usb/usx2y/usX2Yhwdep.c | 7 ++--- sound/usb/usx2y/usx2yhwdeppcm.c | 5 ++- virt/kvm/kvm_main.c | 4 +-- 83 files changed, 273 insertions(+), 281 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index c379ff5a4438..d71f872c9414 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -102,9 +102,9 @@ static void release_spapr_tce_table(struct rcu_head *head) kfree(stt); } -static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_spapr_tce_fault(struct vm_fault *vmf) { - struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + 
struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index a35e2c29d7ee..e5ec1368f0cd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -233,8 +233,9 @@ spufs_mem_write(struct file *file, const char __user *buffer, } static int -spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mem_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct spu_context *ctx = vma->vm_file->private_data; unsigned long pfn, offset; @@ -311,12 +312,11 @@ static const struct file_operations spufs_mem_fops = { .mmap = spufs_mem_mmap, }; -static int spufs_ps_fault(struct vm_area_struct *vma, - struct vm_fault *vmf, +static int spufs_ps_fault(struct vm_fault *vmf, unsigned long ps_offs, unsigned long ps_size) { - struct spu_context *ctx = vma->vm_file->private_data; + struct spu_context *ctx = vmf->vma->vm_file->private_data; unsigned long area, offset = vmf->pgoff << PAGE_SHIFT; int ret = 0; @@ -354,7 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } @@ -367,10 +367,9 @@ refault: } #if SPUFS_MMAP_4K -static int spufs_cntl_mmap_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int spufs_cntl_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); } static const struct vm_operations_struct spufs_cntl_mmap_vmops = { @@ -1067,15 +1066,15 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf, } static int -spufs_signal1_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal1_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size #endif @@ -1205,15 +1204,15 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf, #if SPUFS_MMAP_4K static int -spufs_signal2_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal2_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size #endif @@ -1334,9 +1333,9 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get, #if SPUFS_MMAP_4K static int -spufs_mss_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 
+spufs_mss_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_MSS_MAP_SIZE); } static const struct vm_operations_struct spufs_mss_mmap_vmops = { @@ -1396,9 +1395,9 @@ static const struct file_operations spufs_mss_fops = { }; static int -spufs_psmap_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_psmap_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_PS_MAP_SIZE); } static const struct vm_operations_struct spufs_psmap_mmap_vmops = { @@ -1456,9 +1455,9 @@ static const struct file_operations spufs_psmap_fops = { #if SPUFS_MMAP_4K static int -spufs_mfc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mfc_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE); + return spufs_ps_fault(vmf, 0x3000, SPUFS_MFC_MAP_SIZE); } static const struct vm_operations_struct spufs_mfc_mmap_vmops = { diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d97f6725cf8c..79d1727ae3eb 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2856,7 +2856,7 @@ static void binder_vma_close(struct vm_area_struct *vma) binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } -static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int binder_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 737187865269..53fe633df1e8 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -11,15 +11,14 @@ #include "agp.h" -static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int alpha_core_agp_vm_fault(struct vm_fault *vmf) { alpha_agp_info *agp = agp_bridge->dev_private_data; dma_addr_t dma_addr; unsigned long pa; struct page *page; - dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; + dma_addr = vmf->address - vmf->vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index a697ca0cab1e..a9c2fa3c81e5 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -191,12 +191,12 @@ mspec_close(struct vm_area_struct *vma) * Creates a mspec page and maps it to user space. */ static int -mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +mspec_fault(struct vm_fault *vmf) { unsigned long paddr, maddr; unsigned long pfn; pgoff_t index = vmf->pgoff; - struct vma_data *vdata = vma->vm_private_data; + struct vma_data *vdata = vmf->vma->vm_private_data; maddr = (volatile unsigned long) vdata->maddr[index]; if (maddr == 0) { @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. 
*/ - vm_insert_pfn(vma, vmf->address, pfn); + vm_insert_pfn(vmf->vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 18e9875f6277..0261f332bf3e 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,8 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, - struct vm_fault *vmf) +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -428,7 +427,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, phys_addr_t phys; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -446,7 +445,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vmf->address, pfn); + rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; @@ -456,8 +455,9 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; @@ -466,7 +466,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vma, vmf); + rc = __dax_dev_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index bd311c77c254..bae6e26038ee 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -96,8 +96,9 @@ static pgprot_t drm_dma_prot(uint32_t map_type, struct vm_area_struct *vma) * map, get the page, increment the use count and return it. */ #if IS_ENABLED(CONFIG_AGP) -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_local_map *map = NULL; @@ -168,7 +169,7 @@ vm_fault_error: return VM_FAULT_SIGBUS; /* Disallow mremap */ } #else -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } @@ -184,8 +185,9 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Get the mapping, find the real physical page to map, get the page, and * return it. */ -static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_shm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; unsigned long offset; unsigned long i; @@ -280,14 +282,14 @@ static void drm_vm_shm_close(struct vm_area_struct *vma) /** * \c fault method for DMA virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the page number from the page offset and get it from drm_device_dma::pagelist. 
*/ -static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_dma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_device_dma *dma = dev->dma; @@ -315,14 +317,14 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /** * \c fault method for scatter-gather virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the map offset from the page offset and get it from drm_sg_mem::pagelist. */ -static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_sg_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; @@ -347,24 +349,24 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static int drm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_fault(struct vm_fault *vmf) { - return drm_do_vm_fault(vma, vmf); + return drm_do_vm_fault(vmf); } -static int drm_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_shm_fault(struct vm_fault *vmf) { - return drm_do_vm_shm_fault(vma, vmf); + return drm_do_vm_shm_fault(vmf); } -static int drm_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_dma_fault(struct vm_fault *vmf) { - return drm_do_vm_dma_fault(vma, vmf); + return drm_do_vm_dma_fault(vmf); } -static int drm_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_sg_fault(struct vm_fault *vmf) { - return drm_do_vm_sg_fault(vma, vmf); + return drm_do_vm_sg_fault(vmf); } /** AGP virtual memory operations */ diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 114dddbd297b..68157d97728e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -175,8 +175,9 @@ int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma) return obj->ops->mmap(obj, vma); } -int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int etnaviv_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); struct page **pages, *page; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 57b81460fec8..4c28f7ffcc4d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -447,8 +447,9 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, return ret; } -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int exynos_drm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct exynos_drm_gem *exynos_gem = to_exynos_gem(obj); unsigned long pfn; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index df7c543d6558..85457255fcd1 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -116,7 +116,7 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, uint64_t 
*offset); /* page fault handler and mmap fault address(virtual) to physical memory. */ -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int exynos_drm_gem_fault(struct vm_fault *vmf); /* set vm_flags and we can change the vm attribute to other one at here. */ int exynos_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 8b44fa542562..71f7045455af 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -111,8 +111,9 @@ static int psbfb_pan(struct fb_var_screeninfo *var, struct fb_info *info) return 0; } -static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int psbfb_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct psb_framebuffer *psbfb = vma->vm_private_data; struct drm_device *dev = psbfb->base.dev; struct drm_psb_private *dev_priv = dev->dev_private; diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 527c62917660..7da061aab729 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -164,8 +164,9 @@ int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, * vma->vm_private_data points to the GEM object that is backing this * mapping. */ -int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int psb_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj; struct gtt_range *r; int ret; diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h index 05d7aaf47eea..83e22fd4cfc0 100644 --- a/drivers/gpu/drm/gma500/psb_drv.h +++ b/drivers/gpu/drm/gma500/psb_drv.h @@ -752,7 +752,7 @@ extern int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args); extern int psb_gem_dumb_map_gtt(struct drm_file *file, struct drm_device *dev, uint32_t handle, uint64_t *offset); -extern int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int psb_gem_fault(struct vm_fault *vmf); /* psb_device.c */ extern const struct psb_ops psb_chip_ops; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 8493e19b563a..64f43999a2a1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3105,7 +3105,7 @@ int __must_check i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, unsigned int flags); int __must_check i915_gem_suspend(struct drm_device *dev); void i915_gem_resume(struct drm_device *dev); -int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int i915_gem_fault(struct vm_fault *vmf); int i915_gem_object_wait(struct drm_i915_gem_object *obj, unsigned int flags, long timeout, diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 4b23a7814713..dfc7833e1f7e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1756,7 +1756,6 @@ int i915_gem_mmap_gtt_version(void) /** * i915_gem_fault - fault a page into the GTT - * @area: CPU VMA in question * @vmf: fault info * * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped @@ -1773,9 +1772,10 @@ int i915_gem_mmap_gtt_version(void) * The current feature set supported by i915_gem_fault() and thus GTT mmaps * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version). 
*/ -int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) +int i915_gem_fault(struct vm_fault *vmf) { #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */ + struct vm_area_struct *area = vmf->vma; struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data); struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index ed4dad3ca133..577079b6eaa7 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -206,7 +206,7 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev); int msm_gem_mmap_obj(struct drm_gem_object *obj, struct vm_area_struct *vma); int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int msm_gem_fault(struct vm_fault *vmf); uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj); int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id, uint64_t *iova); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 8098677a3916..e8aa76cde055 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -192,8 +192,9 @@ int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return msm_gem_mmap_obj(vma->vm_private_data, vma); } -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int msm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct msm_drm_private *priv = dev->dev_private; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 4a90c690f09e..f9a3cbb95a2f 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -518,7 +518,6 @@ static int fault_2d(struct drm_gem_object *obj, /** * omap_gem_fault - pagefault handler for GEM objects - * @vma: the VMA of the GEM object * @vmf: fault detail * * Invoked when a fault occurs on an mmap of a GEM managed area. GEM @@ -529,8 +528,9 @@ static int fault_2d(struct drm_gem_object *obj, * vma->vm_private_data points to the GEM object that is backing this * mapping. 
*/ -int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int omap_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct omap_gem_object *omap_obj = to_omap_bo(obj); struct drm_device *dev = obj->dev; diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 11761330a6b8..060e3e879448 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -106,15 +106,15 @@ static void qxl_ttm_global_fini(struct qxl_device *qdev) static struct vm_operations_struct qxl_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int qxl_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qxl_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 0cf03ccbf0a7..fc583811a172 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -979,19 +979,19 @@ void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size) static struct vm_operations_struct radeon_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops = NULL; -static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int radeon_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct radeon_device *rdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) { return VM_FAULT_NOPAGE; } rdev = radeon_get_rdev(bo->bdev); down_read(&rdev->pm.mclk_lock); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); up_read(&rdev->pm.mclk_lock); return r; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 7d853e6b5ff0..9f34662f5140 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -441,8 +441,9 @@ int tegra_bo_dumb_map_offset(struct drm_file *file, struct drm_device *drm, return 0; } -static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tegra_bo_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *gem = vma->vm_private_data; struct tegra_bo *bo = to_tegra_bo(gem); struct page *page; diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 68ef993ab431..247986a1d097 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -43,7 +43,6 @@ #define TTM_BO_VM_NUM_PREFAULT 16 static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, - struct vm_area_struct *vma, struct vm_fault *vmf) { int ret = 0; @@ -66,7 +65,7 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out_unlock; - up_read(&vma->vm_mm->mmap_sem); + up_read(&vmf->vma->vm_mm->mmap_sem); (void) dma_fence_wait(bo->moving, true); goto out_unlock; } @@ -89,8 +88,9 @@ out_unlock: return ret; } -static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ttm_bo_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct ttm_buffer_object *bo = (struct ttm_buffer_object *) vma->vm_private_data; struct ttm_bo_device 
*bdev = bo->bdev; @@ -163,7 +163,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Wait for buffer data in transit, due to a pipelined * move. */ - ret = ttm_bo_vm_fault_idle(bo, vma, vmf); + ret = ttm_bo_vm_fault_idle(bo, vmf); if (unlikely(ret != 0)) { retval = ret; goto out_unlock; diff --git a/drivers/gpu/drm/udl/udl_drv.h b/drivers/gpu/drm/udl/udl_drv.h index f338a576efc8..06d8f3df4c39 100644 --- a/drivers/gpu/drm/udl/udl_drv.h +++ b/drivers/gpu/drm/udl/udl_drv.h @@ -134,7 +134,7 @@ void udl_gem_put_pages(struct udl_gem_object *obj); int udl_gem_vmap(struct udl_gem_object *obj); void udl_gem_vunmap(struct udl_gem_object *obj); int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int udl_gem_fault(struct vm_fault *vmf); int udl_handle_damage(struct udl_framebuffer *fb, int x, int y, int width, int height); diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 3c0c4bd3f750..775c50e4f02c 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -100,8 +100,9 @@ int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return ret; } -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int udl_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct udl_gem_object *obj = to_udl_bo(vma->vm_private_data); struct page *page; unsigned int page_offset; diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 477e07f0ecb6..7ccbb03e98de 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -50,8 +50,9 @@ static void vgem_gem_free_object(struct drm_gem_object *obj) kfree(vgem_obj); } -static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vgem_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ unsigned long vaddr = vmf->address; diff --git a/drivers/gpu/drm/virtio/virtgpu_ttm.c b/drivers/gpu/drm/virtio/virtgpu_ttm.c index 4a1de9f81193..d9f7aaddc703 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ttm.c +++ b/drivers/gpu/drm/virtio/virtgpu_ttm.c @@ -114,18 +114,17 @@ static void virtio_gpu_ttm_global_fini(struct virtio_gpu_device *vgdev) static struct vm_operations_struct virtio_gpu_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int virtio_gpu_ttm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int virtio_gpu_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct virtio_gpu_device *vgdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; vgdev = virtio_gpu_get_vgdev(bo->bdev); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } #endif diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index e8d55a153a65..e88afe1a435c 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1188,9 +1188,9 @@ static void msc_mmap_close(struct vm_area_struct *vma) mutex_unlock(&msc->buf_mutex); } -static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int msc_mmap_fault(struct vm_fault *vmf) { - struct msc_iter *iter = vma->vm_file->private_data; + struct msc_iter *iter = 
vmf->vma->vm_file->private_data; struct msc *msc = iter->msc; vmf->page = msc_buffer_get_page(msc, vmf->pgoff); @@ -1198,7 +1198,7 @@ static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; return 0; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index bd786b7bd30b..f46033984d07 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -92,7 +92,7 @@ static unsigned int poll_next(struct file *, struct poll_table_struct *); static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); -static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static int vma_fault(struct vm_fault *); static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); @@ -695,7 +695,7 @@ done: * Local (non-chip) user memory is not mapped right away but as it is * accessed by the user-level code. */ -static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 2d1eacf1dfed..9396c1807cc3 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -893,7 +893,7 @@ bail: /* * qib_file_vma_fault - handle a VMA page fault. */ -static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qib_file_vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index ba63ca57ed7e..36bd904946bd 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -434,8 +434,9 @@ static void videobuf_vm_close(struct vm_area_struct *vma) * now ...). Bounce buffers don't work very well for the data rates * video capture has. */ -static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int videobuf_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 3907387b6d15..062bf6ca2625 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -121,8 +121,9 @@ void cxl_context_set_mapping(struct cxl_context *ctx, mutex_unlock(&ctx->mapping_lock); } -static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxl_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct cxl_context *ctx = vma->vm_file->private_data; u64 area, offset; diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index af2e077da4b8..3641f1334cf0 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -926,8 +926,9 @@ again: * * Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries. 
*/ -int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int gru_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct gru_thread_state *gts; unsigned long paddr, vaddr; unsigned long expires; diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 5c3ce2459675..b5e308b50ed1 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h @@ -665,7 +665,7 @@ extern unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count, char *cbmap); extern unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count, char *dsmap); -extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf); +extern int gru_fault(struct vm_fault *vmf); extern struct gru_mm_struct *gru_register_mmu_notifier(void); extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms); diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 9636970d9611..1df05b12c6cb 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1045,7 +1045,6 @@ out: /** * cxlflash_mmap_fault() - mmap fault handler for adapter file descriptor - * @vma: VM area associated with mapping. * @vmf: VM fault associated with current fault. * * To support error notification via MMIO, faults are 'caught' by this routine @@ -1059,8 +1058,9 @@ out: * * Return: 0 on success, VM_FAULT_SIGBUS on failure */ -static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxlflash_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct cxl_context *ctx = cxl_fops_get_context(file); struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, @@ -1089,7 +1089,7 @@ static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (likely(!ctxi->err_recovery_active)) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - rc = ctxi->cxl_mmap_vmops->fault(vma, vmf); + rc = ctxi->cxl_mmap_vmops->fault(vmf); } else { dev_dbg(dev, "%s: err recovery active, use err_page!\n", __func__); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index dbe5b4b95df0..dd0cab049872 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1187,8 +1187,9 @@ sg_fasync(int fd, struct file *filp, int mode) } static int -sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +sg_vma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; Sg_fd *sfp; unsigned long offset, len, sa; Sg_scatter_hold *rsv_schp; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 0fb0e28ace70..00bb4692ace3 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -870,9 +870,9 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, mutex_unlock(&buffer->lock); } -static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ion_vm_fault(struct vm_fault *vmf) { - struct ion_buffer *buffer = vma->vm_private_data; + struct ion_buffer *buffer = vmf->vma->vm_private_data; unsigned long pfn; int ret; @@ -881,7 +881,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, vmf->address, pfn); + ret = vm_insert_pfn(vmf->vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git 
a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c index 9afa6bec3e6f..896196c74cd2 100644 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -321,7 +321,7 @@ out: return fault_ret; } -static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault(struct vm_fault *vmf) { int count = 0; bool printed = false; @@ -335,7 +335,7 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); restart: - result = ll_fault0(vma, vmf); + result = ll_fault0(vmf->vma, vmf); LASSERT(!(result & VM_FAULT_LOCKED)); if (result == 0) { struct page *vmpage = vmf->page; @@ -362,8 +362,9 @@ restart: return result; } -static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int count = 0; bool printed = false; bool retry; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 697cbfbe9374..99f995c27f15 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -1006,7 +1006,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) { struct vm_fault *vmf = cfio->ft_vmf; - cfio->ft_flags = filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags = filemap_fault(vmf); cfio->ft_flags_valid = 1; if (vmf->page) { diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 8041710b6972..5c1cb2df3a54 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -783,15 +783,15 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma) return -1; } -static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tcmu_vma_fault(struct vm_fault *vmf) { - struct tcmu_dev *udev = vma->vm_private_data; + struct tcmu_dev *udev = vmf->vma->vm_private_data; struct uio_info *info = &udev->uio_info; struct page *page; unsigned long offset; void *addr; - int mi = tcmu_find_mem_index(vma); + int mi = tcmu_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index fba021f5736a..31d95dc9c202 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -597,14 +597,14 @@ static int uio_find_mem_index(struct vm_area_struct *vma) return -1; } -static int uio_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int uio_vma_fault(struct vm_fault *vmf) { - struct uio_device *idev = vma->vm_private_data; + struct uio_device *idev = vmf->vma->vm_private_data; struct page *page; unsigned long offset; void *addr; - int mi = uio_find_mem_index(vma); + int mi = uio_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91c22276c03b..9fb8b1e6ecc2 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1223,9 +1223,9 @@ static void mon_bin_vma_close(struct vm_area_struct *vma) /* * Map ring pages to user space. 
*/ -static int mon_bin_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mon_bin_vma_fault(struct vm_fault *vmf) { - struct mon_reader_bin *rp = vma->vm_private_data; + struct mon_reader_bin *rp = vmf->vma->vm_private_data; unsigned long offset, chunk_idx; struct page *pageptr; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 74b5bcac8bf2..37f69c061210 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -37,12 +37,11 @@ static struct page *fb_deferred_io_page(struct fb_info *info, unsigned long offs } /* this is to find and return the vmalloc-ed fb pages */ -static int fb_deferred_io_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= info->fix.smem_len) @@ -54,8 +53,8 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma, get_page(page); - if (vma->vm_file) - page->mapping = vma->vm_file->f_mapping; + if (vmf->vma->vm_file) + page->mapping = vmf->vma->vm_file->f_mapping; else printk(KERN_ERR "no mapping available\n"); @@ -91,11 +90,10 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ -static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; @@ -105,7 +103,7 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, deferred framebuffer IO. 
then if userspace touches a page again, we repeat the same scheme */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 6e3306f4a525..fae21a0f1cee 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -598,10 +598,10 @@ static void privcmd_close(struct vm_area_struct *vma) kfree(pages); } -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int privcmd_fault(struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", - vma, vma->vm_start, vma->vm_end, + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6a0f3fa85ef7..3de3b4a89d89 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -534,11 +534,11 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) } static int -v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +v9fs_vm_page_mkwrite(struct vm_fault *vmf) { struct v9fs_inode *v9inode; struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a823719b6c5..adf16307842a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3147,7 +3147,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e861a063721..ea1d500cfba6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8964,10 +8964,10 @@ again: * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. 
*/ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -9000,7 +9000,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = btrfs_delalloc_reserve_space(inode, page_start, reserved_space); if (!ret) { - ret = file_update_time(vma->vm_file); + ret = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e4b066cd912a..09860c0ec7c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1386,8 +1386,9 @@ static void ceph_restore_sigs(sigset_t *oldset) /* * vm ops */ -static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_filemap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; @@ -1416,7 +1417,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { current->journal_info = vma->vm_file; - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); current->journal_info = NULL; } else ret = -EAGAIN; @@ -1477,8 +1478,9 @@ out_restore: /* * Reuse write_begin here for simplicity. */ -static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 18a1e1d6671f..82b9f4053603 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3254,7 +3254,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * sure that it doesn't change while being written back. */ static int -cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/dax.c b/fs/dax.c index 26c2dc3aede0..e316cb617865 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -925,12 +925,11 @@ static int dax_insert_mapping(struct address_space *mapping, /** * dax_pfn_mkwrite - handle first write to DAX page - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault */ -int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int dax_pfn_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -1117,7 +1116,6 @@ static int dax_fault_return(int error) /** * dax_iomap_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system * @@ -1125,10 +1123,9 @@ static int dax_fault_return(int error) * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. 
*/ -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) +int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1201,11 +1198,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vma, vmf); + PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b0f241528a30..0bf0d971205a 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -87,19 +87,19 @@ out_unlock: * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ -static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext2_dax_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); int ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,16 +107,15 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); loff_t size; int ret; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&ei->dax_sem); /* check that the faulting page hasn't raced with truncate */ @@ -124,7 +123,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2163c1e69f2a..2812bcf6ba24 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2491,8 +2491,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_page_mkwrite(struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 75dc3dd531d7..cc0b1113276d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -255,19 +255,19 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext4_dax_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -305,22 +305,21 @@ ext4_dax_pmd_fault(struct vm_fault *vmf) * wp_pfn_shared() fails. Thus fault gets retried and things work out as * desired. */ -static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; loff_t size; int ret; sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 88d57af1b516..fb6fd97b4cfc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5776,8 +5776,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; @@ -5867,13 +5868,13 @@ out: return ret; } -int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_filemap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int err; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vma, vmf); + err = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); return err; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 49f10dce817d..1edc86e874e3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -32,11 +32,10 @@ #include "trace.h" #include -static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; @@ -58,7 +57,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2401c5dabb2a..e80bfd06daf5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2043,12 +2043,12 @@ static void 
fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int fuse_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 016c11eaca7c..6fe2a59c6a9a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -379,10 +379,10 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int gfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; @@ -399,7 +399,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE); + gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -407,7 +407,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_uninit; /* Update file times before taking page lock */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/iomap.c b/fs/iomap.c index a51cb4c07d4d..ccba165bed49 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -445,11 +445,10 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops) +int iomap_page_mkwrite(struct vm_fault *vmf, struct iomap_ops *ops) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; loff_t offset, size; ssize_t ret; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 78219d5644e9..4f0535890b30 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -348,9 +348,9 @@ static void kernfs_vma_open(struct vm_area_struct *vma) kernfs_put_active(of->kn); } -static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kernfs_vma_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -362,16 +362,15 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); + ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } -static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -383,7 +382,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, ret = 0; if (of->vm_ops->page_mkwrite) - 
ret = of->vm_ops->page_mkwrite(vma, vmf); + ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 39f57bef8531..0c3905e0542e 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -27,10 +27,9 @@ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock * page? */ -static int ncp_file_mmap_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int ncp_file_mmap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(area->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); char *pg_addr; unsigned int already_read; unsigned int count; @@ -90,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 26dbe8b0c10d..668213984d68 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -528,10 +528,10 @@ const struct address_space_operations nfs_file_aops = { * writable, implying that someone is about to modify the page through a * shared-writable mapping */ -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nfs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; int ret = VM_FAULT_NOPAGE; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 547381f3ce13..c5fa3dee72fc 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -51,8 +51,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return err; } -static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nilfs_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 429088786e93..098f5c712569 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,17 +44,18 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int ocfs2_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; sigset_t oldset; int ret; ocfs2_block_signals(&oldset); - ret = filemap_fault(area, vmf); + ret = filemap_fault(vmf); ocfs2_unblock_signals(&oldset); - trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, - area, vmf->page, vmf->pgoff); + trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno, + vma, vmf->page, vmf->pgoff); return ret; } @@ -127,10 +128,10 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; int ret; @@ -160,7 +161,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 5105b1599981..f52d8e857ff7 100644 
--- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -265,10 +265,10 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). */ -static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; struct page *page; loff_t offset; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b0d783774c96..d9ae86f96df7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1506,11 +1506,10 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made writable. * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9d8440b07b53..f1849817d237 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1373,22 +1373,21 @@ xfs_file_llseek( */ STATIC int xfs_filemap_page_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_page_mkwrite(XFS_I(inode)); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1400,23 +1399,22 @@ xfs_filemap_page_mkwrite( STATIC int xfs_filemap_fault( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_fault(XFS_I(inode)); /* DAX can shortcut the normal fault path on write faults! 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vma, vmf); + return xfs_filemap_page_mkwrite(vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); else - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); return ret; @@ -1465,11 +1463,10 @@ xfs_filemap_pmd_fault( */ static int xfs_filemap_pfn_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret = VM_FAULT_NOPAGE; loff_t size; @@ -1477,7 +1474,7 @@ xfs_filemap_pfn_mkwrite( trace_xfs_filemap_pfn_mkwrite(ip); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* check if the faulting page hasn't raced with truncate */ xfs_ilock(ip, XFS_MMAPLOCK_SHARED); @@ -1485,7 +1482,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index c1bd6ab5e974..44177005262d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,8 +38,7 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops); -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -83,7 +82,7 @@ static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } #endif -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +int dax_pfn_mkwrite(struct vm_fault *vmf); static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a4c94b86401e..857e440af9d5 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -79,8 +79,7 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - struct iomap_ops *ops); +int iomap_page_mkwrite(struct vm_fault *vmf, struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, struct iomap_ops *ops); diff --git a/include/linux/mm.h b/include/linux/mm.h index d8e585562cce..da46c5d41d21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -346,17 +346,17 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*fault)(struct vm_fault *vmf); int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a 
previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2120,10 +2120,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); diff --git a/ipc/shm.c b/ipc/shm.c index 81203e8ba013..7f6537b84ef5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -374,12 +374,12 @@ void exit_shm(struct task_struct *task) up_write(&shm_ids(ns).rwsem); } -static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shm_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - return sfd->vm_ops->fault(vma, vmf); + return sfd->vm_ops->fault(vmf); } #ifdef CONFIG_NUMA diff --git a/kernel/events/core.c b/kernel/events/core.c index e5aaa806702d..9a897c3b4849 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4869,9 +4869,9 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int perf_mmap_fault(struct vm_fault *vmf) { - struct perf_event *event = vma->vm_file->private_data; + struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; @@ -4894,7 +4894,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto unlock; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..8f8dc91db680 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. 
*/ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int relay_buf_fault(struct vm_fault *vmf) { struct page *page; - struct rchan_buf *buf = vma->vm_private_data; + struct rchan_buf *buf = vmf->vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (!buf) diff --git a/mm/filemap.c b/mm/filemap.c index 416d563468a3..2ba46f410c7c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2169,7 +2169,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, /** * filemap_fault - read in file data for page fault handling - * @vma: vma in which the fault was taken * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a @@ -2191,10 +2190,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { int error; - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2216,12 +2215,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vma, ra, file, page, offset); + do_async_mmap_readahead(vmf->vma, ra, file, page, offset); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vma, ra, file, offset); + do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2229,7 +2228,7 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { put_page(page); return ret | VM_FAULT_RETRY; } @@ -2396,14 +2395,14 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6c7ff316daf..1b8789a2395d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3142,7 +3142,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/memory.c b/mm/memory.c index 7663068a33c6..cf97d88158cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2035,7 +2035,7 @@ static int do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2307,7 +2307,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); @@ -2861,7 +2861,7 @@ static int __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; int ret; - ret = vma->vm_ops->fault(vma, vmf); + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; diff --git a/mm/mmap.c b/mm/mmap.c index b729084eea90..1cd70010edf0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3125,8 +3125,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf); +static int special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3161,9 +3160,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int special_mapping_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; @@ -3173,7 +3172,7 @@ static int special_mapping_fault(struct vm_area_struct *vma, struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) - return sm->fault(sm, vma, vmf); + return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; } diff --git a/mm/nommu.c b/mm/nommu.c index bc964c26be8c..62600ba0d1d8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1794,7 +1794,7 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9c6d22ff44e2..f7f2330c6cb6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1908,8 +1908,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync return ret; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shmem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index cf9293e01fc1..7573f8404e34 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -439,10 +439,9 @@ out: return ret; } -static int sel_mmap_policy_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int sel_mmap_policy_fault(struct vm_fault *vmf) { - struct policy_load_memory *plm = 
vma->vm_file->private_data; + struct policy_load_memory *plm = vmf->vma->vm_file->private_data; unsigned long offset; struct page *page; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9d33c1e85c79..aec9c92250fd 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3245,10 +3245,9 @@ static unsigned int snd_pcm_capture_poll(struct file *file, poll_table * wait) /* * mmap status record */ -static int snd_pcm_mmap_status_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_status_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3282,10 +3281,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file /* * mmap control record */ -static int snd_pcm_mmap_control_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_control_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3341,10 +3339,9 @@ snd_pcm_default_page_ops(struct snd_pcm_substream *substream, unsigned long ofs) /* * fault callback for mmapping a RAM page */ -static int snd_pcm_mmap_data_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_data_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; unsigned long offset; struct page * page; diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index cf5dc33f4a6d..cf45bf1f7ee0 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -137,13 +137,12 @@ static void usb_stream_hwdep_vm_open(struct vm_area_struct *area) snd_printdd(KERN_DEBUG "%i\n", atomic_read(&us122l->mmap_count)); } -static int usb_stream_hwdep_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int usb_stream_hwdep_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; void *vaddr; - struct us122l *us122l = area->vm_private_data; + struct us122l *us122l = vmf->vma->vm_private_data; struct usb_stream *s; mutex_lock(&us122l->mutex); diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 0b34dbc8f302..605e1047c01d 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -31,19 +31,18 @@ #include "usbusx2y.h" #include "usX2Yhwdep.h" -static int snd_us428ctls_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_us428ctls_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page * page; void *vaddr; snd_printdd("ENTER, start %lXh, pgoff %ld\n", - area->vm_start, + vmf->vma->vm_start, vmf->pgoff); offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->us428ctls_sharedmem + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->us428ctls_sharedmem + offset; page = virt_to_page(vaddr); get_page(page); vmf->page = page; diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 90766a92e7fd..f95164b91152 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -652,14 +652,13 @@ static void snd_usX2Y_hwdep_pcm_vm_close(struct vm_area_struct *area) } 
-static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_fault *vmf) { unsigned long offset; void *vaddr; offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->hwdep_pcm_shm + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->hwdep_pcm_shm + offset; vmf->page = virt_to_page(vaddr); get_page(vmf->page); return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 482612b4e496..7556e1d56708 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2348,9 +2348,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_vcpu_fault(struct vm_fault *vmf) { - struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff == 0) -- cgit v1.2.3 From 47d82ae8b57ab94c879e6944a8fbe88286cdad34 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:27:20 +1100 Subject: mm,fs,dax: change ->pmd_fault to ->huge_fault Patch series "1G transparent hugepage support for device dax", v2. The following series implements support for 1G transparent hugepage on x86 for device dax. The bulk of the code was written by Matthew Wilcox a while back supporting transparent 1G hugepage for fs DAX. I have forward ported the relevant bits to 4.10-rc. The current submission has only the necessary code to support device DAX. Comments from Dan Williams: So the motivation and intended user of this functionality mirrors the motivation and users of 1GB page support in hugetlbfs. Given expected capacities of persistent memory devices an in-memory database may want to reduce tlb pressure beyond what they can already achieve with 2MB mappings of a device-dax file. We have customer feedback to that effect as Willy mentioned in his previous version of these patches [1]. [1]: https://lkml.org/lkml/2016/1/31/52 Comments from Nilesh @ Oracle: There are applications which have a process model; and if you assume 10,000 processes attempting to mmap all the 6TB memory available on a server; we are looking at the following: processes : 10,000 memory : 6TB pte @ 4k page size: 8 bytes per 4K of memory, per process = (6TB / 4K) * 8 bytes * 10,000 processes = 12GB * 10,000 = 120,000GB pmd @ 2M page size: 120,000GB / 512 = ~240GB pud @ 1G page size: 240GB / 512 = ~480MB As you can see with 2M pages, this system will use up an exorbitant amount of DRAM to hold the page tables; but the 1G pages finally bring it down to a reasonable level. Memory sizes will keep increasing, so this number will keep increasing. An argument can be made to convert the applications from process model to thread model, but in the real world that may not always be practical. Hopefully this helps explain the use case where this is valuable. This patch (of 3): In preparation for adding the ability to handle PUD pages, convert vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The vm_fault structure is extended to include a union of the different page table pointers that may be needed, and three flag bits are reserved to indicate which type of pointer is in the union.
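To make the new calling convention concrete, the sketch below shows the dispatch pattern this patch introduces: a single ->huge_fault-style entry point that routes the fault by the size bits carried in vmf->flags, which is what dax_iomap_fault() and the device-dax fault handler do after this change. This is a stand-alone userspace illustration, not kernel code; the stub handlers and the main() driver are invented for the example, and the flag values are the ones introduced by this patch (a later patch in this series moves them to avoid a flag collision).

/* Illustration only: mirrors the FAULT_FLAG_SIZE_* dispatch added by this patch. */
#include <stdio.h>

#define VM_FAULT_FALLBACK	0x0800

#define FAULT_FLAG_SIZE_MASK	0x700	/* values as introduced by this patch */
#define FAULT_FLAG_SIZE_PTE	0x000
#define FAULT_FLAG_SIZE_PMD	0x100
#define FAULT_FLAG_SIZE_PUD	0x200

struct vm_fault {
	unsigned int flags;		/* stand-in for the kernel structure */
};

/* Stub handlers standing in for the PTE- and PMD-sized fault paths. */
static int pte_sized_fault(struct vm_fault *vmf) { (void)vmf; return 0; }
static int pmd_sized_fault(struct vm_fault *vmf) { (void)vmf; return 0; }

/* One entry point, dispatching on the fault size encoded in the flags. */
static int huge_fault(struct vm_fault *vmf)
{
	switch (vmf->flags & FAULT_FLAG_SIZE_MASK) {
	case FAULT_FLAG_SIZE_PTE:
		return pte_sized_fault(vmf);
	case FAULT_FLAG_SIZE_PMD:
		return pmd_sized_fault(vmf);
	default:
		/* No handler for this size: retry with smaller pages. */
		return VM_FAULT_FALLBACK;
	}
}

int main(void)
{
	struct vm_fault pmd = { .flags = FAULT_FLAG_SIZE_PMD };
	struct vm_fault pud = { .flags = FAULT_FLAG_SIZE_PUD };

	printf("PMD-sized fault -> 0x%x\n", (unsigned)huge_fault(&pmd));	/* handled */
	printf("PUD-sized fault -> 0x%x\n", (unsigned)huge_fault(&pud));	/* falls back */
	return 0;
}

The same shape is what lets a filesystem install one function for both .fault and .huge_fault, as the ext4 and device-dax hunks in this patch do, instead of keeping separate pte and pmd callbacks.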
Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- drivers/dax/dax.c | 34 +++++++++++++--------------------- fs/dax.c | 43 ++++++++++++++++++++++++++++++------------- fs/ext2/file.c | 2 +- fs/ext4/file.c | 6 +++--- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_trace.h | 2 +- include/linux/dax.h | 6 ------ include/linux/mm.h | 10 +++++++++- mm/memory.c | 14 ++++++++------ 9 files changed, 70 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 0261f332bf3e..922ec461dcaa 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - int rc; - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", vma->vm_start, vma->vm_end); - rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vmf); - rcu_read_unlock(); - - return rc; -} - static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { int rc; struct file *filp = vmf->vma->vm_file; @@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vmf); + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + rc = __dax_dev_pte_fault(dax_dev, vmf); + break; + case FAULT_FLAG_SIZE_PMD: + rc = __dax_dev_pmd_fault(dax_dev, vmf); + break; + default: + return VM_FAULT_FALLBACK; + } rcu_read_unlock(); return rc; @@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, - .pmd_fault = dax_dev_pmd_fault, + .huge_fault = dax_dev_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/dax.c b/fs/dax.c index e316cb617865..25f791d0afee 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1114,16 +1114,7 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -/** - * dax_iomap_fault - handle a page fault on a DAX file - * @vmf: The description of the fault - * @ops: iomap ops passed from the file system - * - * When a page fault occurs, filesystems may call this helper in their fault - * or mkwrite handler for DAX files. Assumes the caller has done all the - * necessary locking for the page fault to proceed successfully. 
- */ -int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops) +static int dax_iomap_pte_fault(struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -1240,7 +1231,6 @@ int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops) } return vmf_ret; } -EXPORT_SYMBOL_GPL(dax_iomap_fault); #ifdef CONFIG_FS_DAX_PMD /* @@ -1331,7 +1321,7 @@ fallback: return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) +static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; @@ -1439,5 +1429,32 @@ out: trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } -EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#else +static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) +{ + return VM_FAULT_FALLBACK; +} #endif /* CONFIG_FS_DAX_PMD */ + +/** + * dax_iomap_fault - handle a page fault on a DAX file + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in + * their fault handler for DAX files. dax_iomap_fault() assumes the caller + * has done all the necessary locking for page fault to proceed + * successfully. + */ +int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops) +{ + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + return dax_iomap_pte_fault(vmf, ops); + case FAULT_FLAG_SIZE_PMD: + return dax_iomap_pmd_fault(vmf, ops); + default: + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(dax_iomap_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0bf0d971205a..68738832beda 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* - * .pmd_fault is not supported for DAX because allocation in ext2 + * .huge_fault is not supported for DAX because allocation in ext2 * cannot be reliably aligned to huge page sizes and so pmd faults * will always fail and fail back to regular faults. */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cc0b1113276d..ed22d20e843c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -276,7 +276,7 @@ static int ext4_dax_fault(struct vm_fault *vmf) } static int -ext4_dax_pmd_fault(struct vm_fault *vmf) +ext4_dax_huge_fault(struct vm_fault *vmf) { int result; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -288,7 +288,7 @@ ext4_dax_pmd_fault(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -328,7 +328,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .pmd_fault = ext4_dax_pmd_fault, + .huge_fault = ext4_dax_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f1849817d237..08accbe84ef1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1423,12 +1423,12 @@ xfs_filemap_fault( /* * Similar to xfs_filemap_fault(), the DAX fault path can call into here on * both read and write faults. 
Hence we need to handle both cases. There is no - * ->pmd_mkwrite callout for huge pages, so we have a single function here to + * ->huge_mkwrite callout for huge pages, so we have a single function here to * handle both cases here. @flags carries the information on the type of fault * occuring. */ STATIC int -xfs_filemap_pmd_fault( +xfs_filemap_huge_fault( struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -1438,7 +1438,7 @@ xfs_filemap_pmd_fault( if (!IS_DAX(inode)) return VM_FAULT_FALLBACK; - trace_xfs_filemap_pmd_fault(ip); + trace_xfs_filemap_huge_fault(ip); if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); @@ -1446,7 +1446,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) @@ -1491,7 +1491,7 @@ xfs_filemap_pfn_mkwrite( static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, - .pmd_fault = xfs_filemap_pmd_fault, + .huge_fault = xfs_filemap_huge_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 69c5bcd9a51b..719b1d4fe586 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); +DEFINE_INODE_EVENT(xfs_filemap_huge_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index 44177005262d..a3bfa264f0ad 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, - struct iomap_ops *ops) -{ - return VM_FAULT_FALLBACK; -} #endif int dax_pfn_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/mm.h b/include/linux/mm.h index da46c5d41d21..e16c6f7d9c33 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,6 +281,11 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SIZE_MASK 0x700 /* Support up to 8-level page tables */ +#define FAULT_FLAG_SIZE_PTE 0x000 /* First level (eg 4k) */ +#define FAULT_FLAG_SIZE_PMD 0x100 /* Second level (eg 2MB) */ +#define FAULT_FLAG_SIZE_PUD 0x200 /* Third level (eg 1GB) */ + #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -310,6 +315,9 @@ struct vm_fault { unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ + pud_t *pud; /* Pointer to pud entry matching + * the 'address' + */ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ @@ -347,7 +355,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int 
(*fault)(struct vm_fault *vmf); - int (*pmd_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/mm/memory.c b/mm/memory.c index cf97d88158cd..baef119c3278 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; pud_t *pud; + int ret; pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -3615,15 +3616,16 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&vmf); + vmf.flags |= FAULT_FLAG_SIZE_PMD; + ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { pmd_t orig_pmd = *vmf.pmd; - int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { + vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); -- cgit v1.2.3 From d2e70d1a9b0558868823968bd2cc3c3075f3bf86 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:27:20 +1100 Subject: mm, dax: clear PMD or PUD size flags when in fall through path Ross reported that: Running xfstests generic/030 with XFS + DAX gives me the following kernel BUG, which I bisected to this commit: mm,fs,dax: Change ->pmd_fault to ->huge_fault [ 370.086205] ------------[ cut here ]------------ [ 370.087182] kernel BUG at arch/x86/mm/fault.c:1038! 
[ 370.088336] invalid opcode: 0000 [#3] PREEMPT SMP [ 370.089073] Modules linked in: dax_pmem nd_pmem dax nd_btt nd_e820 libnvdimm [ 370.090212] CPU: 0 PID: 12415 Comm: xfs_io Tainted: G D 4.10.0-rc5-mm1-00202-g7e90fc0 #10 [ 370.091648] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.1-1.fc24 04/01/2014 [ 370.092946] task: ffff8800ac4f8000 task.stack: ffffc9001148c000 [ 370.093769] RIP: 0010:mm_fault_error+0x15e/0x190 [ 370.094410] RSP: 0000:ffffc9001148fe60 EFLAGS: 00010246 [ 370.095135] RAX: 0000000000000000 RBX: 0000000000000006 RCX: ffff8800ac4f8000 [ 370.096107] RDX: 00007f111c8e6400 RSI: 0000000000000006 RDI: ffffc9001148ff58 [ 370.097087] RBP: ffffc9001148fe88 R08: 0000000000000000 R09: ffff880510bd3300 [ 370.098072] R10: ffff8800ac4f8000 R11: 0000000000000000 R12: 00007f111c8e6400 [ 370.099057] R13: 00007f111c8e6400 R14: ffff880510bd3300 R15: 0000000000000055 [ 370.100135] FS: 00007f111d95e700(0000) GS:ffff880514800000(0000) knlGS:0000000000000000 [ 370.101238] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 370.102021] CR2: 00007f111c8e6400 CR3: 00000000add00000 CR4: 00000000001406f0 [ 370.103189] Call Trace: [ 370.103537] __do_page_fault+0x54e/0x590 [ 370.104090] trace_do_page_fault+0x58/0x2c0 [ 370.104675] do_async_page_fault+0x2c/0x90 [ 370.105342] async_page_fault+0x28/0x30 [ 370.106044] RIP: 0033:0x405e9a [ 370.106470] RSP: 002b:00007fffb7f30590 EFLAGS: 00010287 [ 370.107185] RAX: 00000000004e6400 RBX: 0000000000000057 RCX: 00000000004e7000 [ 370.108155] RDX: 00007f111c400000 RSI: 00000000004e7000 RDI: 0000000001c35080 [ 370.109157] RBP: 00000000004e6400 R08: 0000000000000014 R09: 1999999999999999 [ 370.110158] R10: 00007f111d2dc200 R11: 0000000000000000 R12: 0000000001c32fc0 [ 370.111165] R13: 0000000000000000 R14: 0000000000000c00 R15: 0000000000000005 [ 370.112171] Code: 07 00 00 00 e8 a4 ee ff ff e9 11 ff ff ff 4c 89 ea 48 89 de 45 31 c0 31 c9 e8 8f f7 ff ff 48 83 c4 08 5b 41 5c 41 5d 41 5e 5d c3 <0f> 0b 41 8b 94 24 80 04 00 00 49 8d b4 24 b0 06 00 00 4c 89 e9 [ 370.114823] RIP: mm_fault_error+0x15e/0x190 RSP: ffffc9001148fe60 [ 370.115722] ---[ end trace 2ce10d930638254d ]--- It appears that there are 2 issues. First, the size bits used for vm_fault needs to be shifted over. Otherwise, FAULT_FLAG_SIZE_PMD is clobbering FAULT_FLAG_INSTRUCTION. Second issue, after create_huge_pmd() is being called and is falling back to the pte fault handler, the FAULT_FLAG_SIZE_PMD flag remains and that causes the dax fault handler to go towards the pmd fault handler instead of the pte fault handler. Fixes are made for the pud and pmd fall through paths. 
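To see why the old bit values broke, here is a small stand-alone C sketch (illustration only, not the kernel fault handler; only the flag values are taken from the old and new definitions in this patch): with FAULT_FLAG_SIZE_PMD defined as 0x100, a PMD-sized fault also appeared to be an instruction fetch, since FAULT_FLAG_INSTRUCTION is 0x100, which per the analysis above is what led to the BUG in the x86 fault path. Shifting the size bits up removes the overlap, and clearing the size flag on the fall-through path keeps a retried fault on the pte path.

/* Illustration only: flag values before/after this fix; the check is not kernel code. */
#include <stdio.h>

#define FAULT_FLAG_INSTRUCTION	0x100	/* fault was an instruction fetch */

#define OLD_FAULT_FLAG_SIZE_PMD	0x100	/* before this fix: same bit as INSTRUCTION */
#define NEW_FAULT_FLAG_SIZE_PMD	0x1000	/* after this fix: its own bit */

static void classify(const char *tag, unsigned int flags)
{
	printf("%s: instruction fetch? %s\n", tag,
	       (flags & FAULT_FLAG_INSTRUCTION) ? "yes (spurious)" : "no");
}

int main(void)
{
	/* A PMD-sized data fault, encoded with the old and with the new bit. */
	classify("old FAULT_FLAG_SIZE_PMD", OLD_FAULT_FLAG_SIZE_PMD);
	classify("new FAULT_FLAG_SIZE_PMD", NEW_FAULT_FLAG_SIZE_PMD);

	/* Second fix, sketched: when create_huge_pmd() returns
	 * VM_FAULT_FALLBACK, the size bit has to be dropped before the
	 * fault is retried at pte granularity, e.g.
	 *	vmf.flags &= ~FAULT_FLAG_SIZE_PMD;
	 * otherwise a DAX ->huge_fault keeps choosing the pmd path.
	 */
	return 0;
}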
Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Reported-by: Ross Zwisler Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- mm/memory.c | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index e16c6f7d9c33..ab45a55c809b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,10 +281,10 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ -#define FAULT_FLAG_SIZE_MASK 0x700 /* Support up to 8-level page tables */ -#define FAULT_FLAG_SIZE_PTE 0x000 /* First level (eg 4k) */ -#define FAULT_FLAG_SIZE_PMD 0x100 /* Second level (eg 2MB) */ -#define FAULT_FLAG_SIZE_PUD 0x200 /* Third level (eg 1GB) */ +#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ +#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ +#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ +#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ diff --git a/mm/memory.c b/mm/memory.c index baef119c3278..e721e8eba570 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3620,6 +3620,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PMD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { pmd_t orig_pmd = *vmf.pmd; @@ -3634,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PUD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PUD; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; -- cgit v1.2.3 From 8987c0b27fc53fe2734948d77c021e418aa49c3c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 8 Feb 2017 10:27:21 +1100 Subject: mm, x86: add support for PUD-sized transparent hugepages The current transparent hugepage code only supports PMDs. This patch adds support for transparent use of PUDs with DAX. It does not include support for anonymous pages. x86 support code also added. Most of this patch simply parallels the work that was done for huge PMDs. The only major difference is how the new ->pud_entry method in mm_walk works. The ->pmd_entry method replaces the ->pte_entry method, whereas the ->pud_entry method works along with either ->pmd_entry or ->pte_entry. The pagewalk code takes care of locking the PUD before calling ->pud_walk, so handlers do not need to worry whether the PUD is stable. Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/Kconfig | 3 + arch/x86/Kconfig | 1 + arch/x86/include/asm/paravirt.h | 11 ++ arch/x86/include/asm/paravirt_types.h | 2 + arch/x86/include/asm/pgtable-2level.h | 17 +++ arch/x86/include/asm/pgtable-3level.h | 24 ++++ arch/x86/include/asm/pgtable.h | 140 +++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 15 ++ arch/x86/kernel/paravirt.c | 1 + arch/x86/mm/pgtable.c | 31 +++++ include/asm-generic/pgtable.h | 80 ++++++++++- include/asm-generic/tlb.h | 14 ++ include/linux/huge_mm.h | 83 +++++++++++- include/linux/mm.h | 30 +++- include/linux/mmu_notifier.h | 14 ++ include/linux/pfn_t.h | 12 ++ mm/gup.c | 7 + mm/huge_memory.c | 249 ++++++++++++++++++++++++++++++++++ mm/memory.c | 88 +++++++++++- mm/pagewalk.c | 20 ++- mm/pgtable-generic.c | 14 ++ 21 files changed, 838 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 99839c23d453..36d0f9e9f17f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -545,6 +545,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + bool + config HAVE_ARCH_HUGE_VMAP bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e487493bbd47..2c4a6abf2c88 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,6 +105,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1eea6ca40694..1b5c1725c630 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, native_pmd_val(pmd)); } +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (sizeof(pudval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); + else + PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, + native_pud_val(pud)); +} + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index bb2de45a60f2..b060f962d581 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -249,6 +249,8 @@ struct pv_mmu_ops { void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmdval); + void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pudval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index fd74a11959de..a8b96e708c2b 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) *pmdp = pmd; } +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ +} + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); @@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp) native_set_pmd(pmdp, __pmd(0)); } +static inline void native_pud_clear(pud_t *pudp) +{ +} + static inline void native_pte_clear(struct 
mm_struct *mm, unsigned long addr, pte_t *xp) { @@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ + return __pud(xchg((pudval_t *)xp, 0)); +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Bit manipulation helper on pte/pgoff entry */ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, unsigned long mask, unsigned int leftshift) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index cdaa58c9b39e..be759ff13d02 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,6 +176,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pud { + struct { + u32 pud_low; + u32 pud_high; + }; + pud_t pud; +}; + +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + union split_pud res, *orig = (union split_pud *)pudp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pud_low = xchg(&orig->pud_low, 0); + res.pud_high = orig->pud_high; + orig->pud_high = 0; + + return res.pud; +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..1cfb36b8c024 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) +#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } +static inline int pud_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_DIRTY; +} + +static inline int pud_young(pud_t pud) +{ + return pud_flags(pud) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; +} +#endif + #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { @@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_devmap(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_DEVMAP); +} +#else +static inline int pud_devmap(pud_t pud) +{ + return 0; +} +#endif #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } +static inline pud_t 
pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v & ~clear); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_DIRTY); +} + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DEVMAP); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_PSE); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } +static inline int pud_soft_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SOFT_DIRTY; +} + static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); @@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_mksoft_dirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SOFT_DIRTY); +} + static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_clear_soft_dirty(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) massage_pgprot(pgprot)); } +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) +{ + return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte); @@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) return res; } +static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) +{ + pud_t res = *pudp; + + native_pud_clear(pudp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, native_set_pmd(pmdp, pmd); } +static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + native_set_pud(pudp, pud); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define 
__HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, @@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long return native_pmdp_get_and_clear(pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + return native_pudp_get_and_clear(pudp); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 62b775926045..73c7ccc38912 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pud(xchg(&xp->pud, 0)); +#else + /* native_local_pudp_get_and_clear, + * but duplicated because of cyclic dependency + */ + pud_t ret = *xp; + + native_pud_clear(xp); + return ret; +#endif +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index a1bfba0f7234..4797e87b0fb6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, + .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..6cbdff26bb96 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, return changed; } + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + *pudp = entry; + /* + * We had a write-protection fault here and changed the pud + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. 
+ */ + } + + return changed; +} #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} #endif int ptep_clear_flush_young(struct vm_area_struct *vma, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 18af2bcefe6a..a0aba0f9c57b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, @@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, BUILD_BUG(); return 0; } +static inline int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) @@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_clear(pmdp); return pmd; } +#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, + pud_t *pudp) +{ + pud_t pud = *pudp; + + pud_clear(pudp); + return pud; +} +#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(mm, address, pmdp); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pud_t *pudp, + int full) +{ + return pudp_huge_get_and_clear(mm, address, pudp); +} +#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, @@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma, extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pud_t *pudp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT +#ifdef 
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + pud_t old_pud = *pudp; + + set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); +} +#else +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + BUILD_BUG(); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + return pud_val(pud_a) == pud_val(pud_b); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { BUILD_BUG(); return 0; } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd) #endif /* __HAVE_ARCH_PMD_WRITE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ + (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) +static inline int pud_trans_huge(pud_t pud) +{ + return 0; +} +#endif + #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { @@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd) * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() +#define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 7eed8cf3130a..4329bc6ef04b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) +/** + * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb + * invalidation. This is a nop so far, because only x86 needs it. 
+ */ +#ifndef __tlb_remove_pud_tlb_entry +#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) +#endif + +#define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ + do { \ + __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ + __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ + } while (0) + /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0029e786205..a3762d49ba39 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +#else +static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ +} +#endif + extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); +extern int zap_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pud_t *pud, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); @@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - pfn_t pfn, bool write); +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, pfn_t pfn, bool write); +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vm_mm->mmap_sem), vma); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) + return __pud_trans_huge_lock(pud, vma); + else + return NULL; +} static inline int hpage_nr_pages(struct page *page) { if (unlikely(PageTransHuge(page))) @@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page) return 1; } +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags); +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags); + extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return is_huge_zero_page(pmd_page(pmd)); } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); @@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct 
mm_struct *mm); #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) + #define hpage_nr_pages(x) 1 #define transparent_hugepage_enabled(__vma) 0 @@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} +#define split_huge_pud(__vma, __pmd, __address) \ + do { } while (0) + static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, { return NULL; } +static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, + struct vm_area_struct *vma) +{ + return NULL; +} static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { @@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; @@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, { return NULL; } + +static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud, int flags) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index ab45a55c809b..c3e35474e7ff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -420,6 +420,10 @@ static inline int pmd_devmap(pmd_t pmd) { return 0; } +static inline int pud_devmap(pud_t pud) +{ + return 0; +} #endif /* @@ -1195,6 +1199,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1214,6 +1222,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see the comment on walk_page_range() for more details) */ struct mm_walk { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, @@ -1797,8 +1807,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -extern void __init pagecache_init(void); +/* + * No scalability reason to split PUD locks yet, but follow the same pattern + * as the PMD locks to make it easier if we decide to. The VM should not be + * considered ready to switch to split PUD locks yet; there may be places + * which need to be converted from page_table_lock. 
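A minimal sketch of the usage pattern this enables (illustrative only; the helper name below is hypothetical): a caller that wants to examine a potentially huge PUD takes pud_lock() around the check, the same way mm/gup.c does later in this series.

#include <linux/mm.h>

static struct page *huge_pud_page_of(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *ptl;
        struct page *page = NULL;

        ptl = pud_lock(mm, pud);        /* today this is mm->page_table_lock */
        if (pud_trans_huge(*pud) || pud_devmap(*pud))
                page = pud_page(*pud);
        spin_unlock(ptl);

        return page;
}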
+ */ +static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) +{ + return &mm->page_table_lock; +} + +static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) +{ + spinlock_t *ptl = pud_lockptr(mm, pud); + + spin_lock(ptl); + return ptl; +} +extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a1a210d59961..51891fb0d3ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pmd; \ }) +#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ +({ \ + unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ + struct mm_struct *___mm = (__vma)->vm_mm; \ + pud_t ___pud; \ + \ + ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ + mmu_notifier_invalidate_range(___mm, ___haddr, \ + ___haddr + HPAGE_PUD_SIZE); \ + \ + ___pud; \ +}) + #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ @@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 033fc7bbcefa..a49b3259cad7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) { return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pud(pfn_t_to_pfn(pfn), pgprot); +} +#endif #endif #ifdef __HAVE_ARCH_PTE_DEVMAP @@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn) } pte_t pte_mkdevmap(pte_t pte); pmd_t pmd_mkdevmap(pmd_t pmd); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pud_mkdevmap(pud_t pud); #endif +#endif /* __HAVE_ARCH_PTE_DEVMAP */ + #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/gup.c b/mm/gup.c index 40abe4c90383..1e67461b2733 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9ecc2aeadfc..85742ac5b32e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pud = pud_mkwrite(pud); + return pud; +} + +static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) +{ + struct 
mm_struct *mm = vma->vm_mm; + pud_t entry; + spinlock_t *ptl; + + ptl = pud_lock(mm, pud); + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pud_mkdevmap(entry); + if (write) { + entry = pud_mkyoung(pud_mkdirty(entry)); + entry = maybe_pud_mkwrite(entry, vma); + } + set_pud_at(mm, addr, pud, entry); + update_mmu_cache_pud(vma, addr, pud); + spin_unlock(ptl); +} + +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pud_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON(!pfn_t_devmap(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -887,6 +941,123 @@ out: return ret; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud) +{ + pud_t _pud; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pud is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pud to + * set the young bit, instead of the current set_pud_at. + */ + _pud = pud_mkyoung(pud_mkdirty(*pud)); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, + pud, _pud, 1)) + update_mmu_cache_pud(vma, addr, pud); +} + +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags) +{ + unsigned long pfn = pud_pfn(*pud); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pud_lockptr(mm, pud)); + + if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + if (pud_present(*pud) && pud_devmap(*pud)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pud); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + pud_t pud; + int ret; + + dst_ptl = pud_lock(dst_mm, dst_pud); + src_ptl = pud_lockptr(src_mm, src_pud); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pud = *src_pud; + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + goto out_unlock; + + /* + * When page table lock is held, the huge zero pud should not be + * under splitting since we don't split the page itself, only pud to + * a page table. 
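For reference, a hedged sketch of how a device fault handler might call the new vmf_insert_pfn_pud() (the handler name and the origin of @pfn are hypothetical; it assumes vmf->pud has been populated as in the dax driver, and that the pfn is devmap-backed and PUD-aligned).

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

static int example_dev_pud_fault(struct vm_fault *vmf, pfn_t pfn)
{
        unsigned long pud_addr = vmf->address & PUD_MASK;
        bool write = vmf->flags & FAULT_FLAG_WRITE;

        /* vmf_insert_pfn_pud() validates the vma flags and address range */
        return vmf_insert_pfn_pud(vmf->vma, pud_addr, vmf->pud, pfn, write);
}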
+ */ + if (is_huge_zero_pud(pud)) { + /* No huge zero pud yet */ + } + + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); + set_pud_at(dst_mm, addr, dst_pud, pud); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + return ret; +} + +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ + pud_t entry; + unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); + if (unlikely(!pud_same(*vmf->pud, orig_pud))) + goto unlock; + + entry = pud_mkyoung(orig_pud); + if (write) + entry = pud_mkdirty(entry); + haddr = vmf->address & HPAGE_PUD_MASK; + if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) + update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); + +unlock: + spin_unlock(vmf->ptl); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; @@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } +/* + * Returns true if a given pud maps a thp, false otherwise. + * + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. + */ +spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pud, unsigned long addr) +{ + pud_t orig_pud; + spinlock_t *ptl; + + ptl = __pud_trans_huge_lock(pud, vma); + if (!ptl) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pudp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pudp related + * operations. 
+ */ + orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, + tlb->fullmm); + tlb_remove_pud_tlb_entry(tlb, pud, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + /* No zero page support yet */ + } else { + /* No support for anonymous PUD pages yet */ + BUG(); + } + return 1; +} + +static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, + unsigned long haddr) +{ + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); + VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + + count_vm_event(THP_SPLIT_PMD); + + pudp_huge_clear_flush_notify(vma, haddr, pud); +} + +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PUD_MASK; + + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); + ptl = pud_lock(mm, pud); + if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, haddr); + +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { diff --git a/mm/memory.c b/mm/memory.c index e721e8eba570..41e2a2d4b2a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src src_pud = pud_offset(src_pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, @@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); } while (pud++, addr = next, addr != end); return addr; @@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } +static int create_huge_pud(struct vm_fault *vmf) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static int 
wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; - pud_t *pud; int ret; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) + + vmf.pud = pud_alloc(mm, pgd, address); + if (!vmf.pud) return VM_FAULT_OOM; - vmf.pmd = pmd_alloc(mm, pud, address); + if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + vmf.flags |= FAULT_FLAG_SIZE_PUD; + ret = create_huge_pud(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pud_t orig_pud = *vmf.pud; + + barrier(); + if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + vmf.flags |= FAULT_FLAG_SIZE_PUD; + + /* NUMA case for anonymous PUDs would go here */ + + if (dirty && !pud_write(orig_pud)) { + ret = wp_huge_pud(&vmf, orig_pud); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pud_set_accessed(&vmf, orig_pud); + return 0; + } + } + } + + vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { @@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); + ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); @@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 207244489a68..03761577ae86 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_offset(pgd, addr); do { + again: next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { + if (pud_none(*pud) || !walk->vma) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + + if (walk->pud_entry) { + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } + + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 71c5f9109f2a..4ed5908c65b0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 
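A hedged sketch of a page-table walker using the new pud_entry callback (the function names are hypothetical; the caller is assumed to hold mmap_sem, and walk_pud_range() above only invokes pud_entry for trans-huge/devmap PUDs under pud_trans_huge_lock()).

#include <linux/mm.h>

static int count_huge_pud(pud_t *pud, unsigned long addr,
                          unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        /* walk_pud_range() only gets here with the PUD lock held */
        (*count)++;
        return 0;
}

static unsigned long count_huge_puds(struct mm_struct *mm,
                                     unsigned long start, unsigned long end)
{
        unsigned long count = 0;
        struct mm_walk walk = {
                .pud_entry      = count_huge_pud,
                .mm             = mm,
                .private        = &count,
        };

        /* caller is assumed to hold mm->mmap_sem */
        walk_page_range(start, end, &walk);
        return count;
}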
return pmd; } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t pud; + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return pud; +} +#endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT -- cgit v1.2.3 From f0337695656599776bf1a4737dfb96ca8dcbcdc2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 8 Feb 2017 10:27:23 +1100 Subject: mm: replace FAULT_FLAG_SIZE with parameter to huge_fault Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has been somewhat painful with getting the flags set and removed at the correct locations. More than one kernel oops was introduced due to difficulties of getting the placement correctly. Remove the flag values and introduce an input parameter to huge_fault that indicates the size of the page entry. This makes the code easier to trace and should avoid the issues we see with the fault flags where removal of the flag was necessary in the fallback paths. Link: http://lkml.kernel.org/r/148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Tested-by: Dan Williams Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Cc: Vlastimil Babka Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- drivers/dax/dax.c | 18 ++++++++++++------ fs/dax.c | 9 +++++---- fs/ext2/file.c | 2 +- fs/ext4/file.c | 12 +++++++++--- fs/xfs/xfs_file.c | 9 +++++---- include/linux/dax.h | 3 ++- include/linux/mm.h | 14 ++++++++------ mm/memory.c | 17 ++++------------- 8 files changed, 46 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index b90bb301bda0..b75c77254fdb 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -538,7 +538,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static int dax_dev_fault(struct vm_fault *vmf) +static int dax_dev_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) { int rc; struct file *filp = vmf->vma->vm_file; @@ -550,14 +551,14 @@ static int dax_dev_fault(struct vm_fault *vmf) vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { - case FAULT_FLAG_SIZE_PTE: + switch (pe_size) { + case PE_SIZE_PTE: rc = __dax_dev_pte_fault(dax_dev, vmf); break; - case FAULT_FLAG_SIZE_PMD: + case PE_SIZE_PMD: rc = __dax_dev_pmd_fault(dax_dev, vmf); break; - case FAULT_FLAG_SIZE_PUD: + case PE_SIZE_PUD: rc = __dax_dev_pud_fault(dax_dev, vmf); break; default: @@ -568,9 +569,14 @@ static int dax_dev_fault(struct vm_fault *vmf) return rc; } +static int dax_dev_fault(struct vm_fault *vmf) +{ + return dax_dev_huge_fault(vmf, PE_SIZE_PTE); +} + static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, - .huge_fault = dax_dev_fault, + .huge_fault = dax_dev_huge_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/dax.c b/fs/dax.c index 25f791d0afee..97b8ecbed1c4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1446,12 +1446,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) * has done all the necessary locking for page fault to proceed * 
successfully. */ -int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops) +int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + struct iomap_ops *ops) { - switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { - case FAULT_FLAG_SIZE_PTE: + switch (pe_size) { + case PE_SIZE_PTE: return dax_iomap_pte_fault(vmf, ops); - case FAULT_FLAG_SIZE_PMD: + case PE_SIZE_PMD: return dax_iomap_pmd_fault(vmf, ops); default: return VM_FAULT_FALLBACK; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 68738832beda..b21891a6bfca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 51d71559cd1d..e8ab46efc4f9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -255,7 +255,8 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_fault(struct vm_fault *vmf) +static int ext4_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) { int result; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -267,7 +268,7 @@ static int ext4_dax_fault(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -275,6 +276,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return result; } +static int ext4_dax_fault(struct vm_fault *vmf) +{ + return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); +} + /* * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. 
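As a usage note, the wiring pattern the ext2/ext4/xfs hunks establish applies to any DAX filesystem; a sketch for a hypothetical one, assuming an iomap_ops instance named example_iomap_ops exists:

#include <linux/dax.h>
#include <linux/iomap.h>
#include <linux/mm.h>

extern struct iomap_ops example_iomap_ops;      /* hypothetical */

static int example_dax_huge_fault(struct vm_fault *vmf,
                                  enum page_entry_size pe_size)
{
        return dax_iomap_fault(vmf, pe_size, &example_iomap_ops);
}

static int example_dax_fault(struct vm_fault *vmf)
{
        /* ->fault keeps its old prototype, so wrap the PTE-sized case */
        return example_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct example_dax_vm_ops = {
        .fault          = example_dax_fault,
        .huge_fault     = example_dax_huge_fault,
};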
Note that since we cycle through @@ -307,7 +313,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .huge_fault = ext4_dax_fault, + .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 08accbe84ef1..9d7533abf64a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1385,7 +1385,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1412,7 +1412,7 @@ xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); else ret = filemap_fault(vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1429,7 +1429,8 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_huge_fault( - struct vm_fault *vmf) + struct vm_fault *vmf, + enum page_entry_size pe_size) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1446,7 +1447,7 @@ xfs_filemap_huge_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index a3bfa264f0ad..df63730e86d2 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,7 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops); -int dax_iomap_fault(struct vm_fault *vmf, struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, diff --git a/include/linux/mm.h b/include/linux/mm.h index c3e35474e7ff..7278475c489d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -281,11 +281,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ -#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ -#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ -#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ -#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ - #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -345,6 +340,13 @@ struct vm_fault { */ }; +/* page entry size for vm->huge_fault() */ +enum page_entry_size { + PE_SIZE_PTE = 0, + PE_SIZE_PMD, + PE_SIZE_PUD, +}; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -355,7 +357,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int 
(*huge_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/mm/memory.c b/mm/memory.c index 41e2a2d4b2a6..6040b74d02a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3489,7 +3489,7 @@ static int create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; } @@ -3498,7 +3498,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3519,7 +3519,7 @@ static int create_huge_pud(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -3531,7 +3531,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -3659,7 +3659,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pud) return VM_FAULT_OOM; if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { - vmf.flags |= FAULT_FLAG_SIZE_PUD; ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3670,8 +3669,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - vmf.flags |= FAULT_FLAG_SIZE_PUD; - /* NUMA case for anonymous PUDs would go here */ if (dirty && !pud_write(orig_pud)) { @@ -3689,18 +3686,14 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - vmf.flags |= FAULT_FLAG_SIZE_PMD; ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; - /* fall through path, remove PMD flag */ - vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { pmd_t orig_pmd = *vmf.pmd; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); @@ -3709,8 +3702,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; - /* fall through path, remove PUD flag */ - vmf.flags &= ~FAULT_FLAG_SIZE_PUD; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; -- cgit v1.2.3 From bba74cf635715e0c67e347eeaa4abd14eed6c00f Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 8 Feb 2017 10:27:25 +1100 Subject: mm/migration: make isolate_movable_page() return int type Patch series "HWPOISON: soft offlining for non-lru movable page", v6. 
After Minchan's commit bda807d44454 ("mm: migrate: support non-lru movable page migration"), some type of non-lru page like zsmalloc and virtio-balloon page also support migration. Therefore, we can: 1) soft offlining no-lru movable pages, which means when memory corrected errors occur on a non-lru movable page, we can stop to use it by migrating data onto another page and disable the original (maybe half-broken) one. 2) enable memory hotplug for non-lru movable pages, i.e. we may offline blocks, which include such pages, by using non-lru page migration. This patchset is heavily dependent on non-lru movable page migration. This patch (of 4): Change the return type of isolate_movable_page() from bool to int. It will return 0 when isolate movable page successfully, and return -EBUSY when it isolates failed. There is no functional change within this patch but prepare for later patch. Link: http://lkml.kernel.org/r/1485867981-16037-2-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Suggested-by: Michal Hocko Acked-by: Minchan Kim Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 +- mm/compaction.c | 2 +- mm/migrate.c | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ae8d475a9385..43d5deb0c4cc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -37,7 +37,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); -extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern int migrate_prep(void); diff --git a/mm/compaction.c b/mm/compaction.c index 0aa2757399ee..0fdfde016ee2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -802,7 +802,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = false; } - if (isolate_movable_page(page, isolate_mode)) + if (!isolate_movable_page(page, isolate_mode)) goto isolate_success; } diff --git a/mm/migrate.c b/mm/migrate.c index 87f4d0f81819..bbbd170efab7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -74,8 +74,9 @@ int migrate_prep_local(void) return 0; } -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +int isolate_movable_page(struct page *page, isolate_mode_t mode) { + int ret = -EBUSY; struct address_space *mapping; /* @@ -95,8 +96,10 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) * assumes anybody doesn't touch PG_lock of newly allocated page * so unconditionally grapping the lock ruins page's owner side. 
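A hedged sketch of a caller reacting to the new integer return values (the function name is hypothetical; 0 means the page was isolated, -EINVAL that it is not a movable page, and -EBUSY that isolation failed for now):

#include <linux/migrate.h>
#include <linux/mmzone.h>

static int example_isolate(struct page *page, isolate_mode_t mode)
{
        int ret = isolate_movable_page(page, mode);

        switch (ret) {
        case 0:
                return 0;               /* isolated, PG_isolated is set */
        case -EINVAL:
                return ret;             /* not (or no longer) a movable page */
        default:
                return -EBUSY;          /* raced with free/migration, retry later */
        }
}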
*/ - if (unlikely(!__PageMovable(page))) + if (unlikely(!__PageMovable(page))) { + ret = -EINVAL; goto out_putpage; + } /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions @@ -125,14 +128,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) __SetPageIsolated(page); unlock_page(page); - return true; + return 0; out_no_isolated: unlock_page(page); out_putpage: put_page(page); out: - return false; + return ret; } /* It should be called on page which is PG_movable */ -- cgit v1.2.3 From 41067483e77ab900ddc4ea455667045de96649c0 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 8 Feb 2017 10:27:26 +1100 Subject: mm/migration: make isolate_movable_page always defined Define isolate_movable_page as a static inline function when CONFIG_MIGRATION is not enable. It should return -EBUSY here which means failed to isolate movable pages. This patch do not have any functional change but prepare for later patch. Link: http://lkml.kernel.org/r/1485867981-16037-3-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Minchan Kim Suggested-by: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 43d5deb0c4cc..fa76b516fa47 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -56,6 +56,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } +static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) + { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.2.3 From 9493a7b4766de76d3560f963ca28e8e086918e68 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 8 Feb 2017 10:27:28 +1100 Subject: mm: introduce page_vma_mapped_walk() Introduce a new interface to check if a page is mapped into a vma. It aims to address shortcomings of page_check_address{,_transhuge}. Existing interface is not able to handle PTE-mapped THPs: it only finds the first PTE. The rest lefted unnoticed. page_vma_mapped_walk() iterates over all possible mapping of the page in the vma. Link: http://lkml.kernel.org/r/20170129173858.45174-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/rmap.h | 26 +++++++ mm/Makefile | 6 +- mm/huge_memory.c | 9 ++- mm/page_vma_mapped.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 mm/page_vma_mapped.c (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 15321fb1df6b..b76343610653 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -9,6 +9,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -232,6 +233,31 @@ static inline bool page_check_address_transhuge(struct page *page, } #endif +/* Avoid racy checks */ +#define PVMW_SYNC (1 << 0) +/* Look for migarion entries rather than present PTEs */ +#define PVMW_MIGRATION (1 << 1) + +struct page_vma_mapped_walk { + struct page *page; + struct vm_area_struct *vma; + unsigned long address; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + unsigned int flags; +}; + +static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->pte) + pte_unmap(pvmw->pte); + if (pvmw->ptl) + spin_unlock(pvmw->ptl); +} + +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); + /* * Used by swapoff to help locate where page is expected in vma. */ diff --git a/mm/Makefile b/mm/Makefile index 433eaf9a876e..aa0aa17cb413 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + mlock.o mmap.o mprotect.o mremap.o msync.o \ + page_vma_mapped.o pagewalk.o pgtable-generic.o \ + rmap.o vmalloc.o + ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 85742ac5b32e..a7bac4f2b78a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2129,9 +2129,12 @@ static void freeze_page(struct page *page) static void unfreeze_page(struct page *page) { int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) - remove_migration_ptes(page + i, page + i, true); + if (PageTransHuge(page)) { + remove_migration_ptes(page, page, true); + } else { + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); + } } static void __split_huge_page_tail(struct page *head, int tail, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c new file mode 100644 index 000000000000..dc1a54826cf2 --- /dev/null +++ b/mm/page_vma_mapped.c @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include + +#include "internal.h" + +static inline bool check_pmd(struct page_vma_mapped_walk *pvmw) +{ + pmd_t pmde; + /* + * Make sure we don't re-load pmd between present and !trans_huge check. + * We need a consistent view. 
+ */ + pmde = READ_ONCE(*pvmw->pmd); + return pmd_present(pmde) && !pmd_trans_huge(pmde); +} + +static inline bool not_found(struct page_vma_mapped_walk *pvmw) +{ + page_vma_mapped_walk_done(pvmw); + return false; +} + +static bool map_pte(struct page_vma_mapped_walk *pvmw) +{ + pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); + if (!(pvmw->flags & PVMW_SYNC)) { + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(*pvmw->pte)) + return false; + } else { + if (!pte_present(*pvmw->pte)) + return false; + } + } + pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + spin_lock(pvmw->ptl); + return true; +} + +static bool check_pte(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->flags & PVMW_MIGRATION) { +#ifdef CONFIG_MIGRATION + swp_entry_t entry; + if (!is_swap_pte(*pvmw->pte)) + return false; + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) + return false; + if (migration_entry_to_page(entry) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (migration_entry_to_page(entry) < pvmw->page) + return false; +#else + WARN_ON_ONCE(1); +#endif + } else { + if (!pte_present(*pvmw->pte)) + return false; + + /* THP can be referenced by any subpage */ + if (pte_page(*pvmw->pte) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (pte_page(*pvmw->pte) < pvmw->page) + return false; + } + + return true; +} + +/** + * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * @pvmw->address + * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags + * must be set. pmd, pte and ptl must be NULL. + * + * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point + * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is + * adjusted if needed (for PTE-mapped THPs). + * + * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page + * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in + * a loop to find all PTEs that map the THP. + * + * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry + * regardless of which page table level the page is mapped at. @pvmw->pmd is + * NULL. + * + * Retruns false if there are no more page table entries for the page in + * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. + * + * If you need to stop the walk before page_vma_mapped_walk() returned false, + * use page_vma_mapped_walk_done(). It will do the housekeeping. 
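A hedged usage sketch of the loop described above (the helper name is hypothetical; each true return leaves pvmw.ptl held and pvmw.pte or pvmw.pmd pointing at one mapping, and the final false return drops the locks itself):

#include <linux/mm.h>
#include <linux/rmap.h>

static int count_mappings_in_vma(struct page *page,
                                 struct vm_area_struct *vma,
                                 unsigned long address)
{
        struct page_vma_mapped_walk pvmw = {
                .page           = page,
                .vma            = vma,
                .address        = address,
                .flags          = PVMW_SYNC,
        };
        int mapcount = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                /* one mapping found; pvmw.pte/pvmw.pmd are valid here */
                mapcount++;
        }
        /* on the final false return the walk has unlocked and unmapped */
        return mapcount;
}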
+ */ +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) +{ + struct mm_struct *mm = pvmw->vma->vm_mm; + struct page *page = pvmw->page; + pgd_t *pgd; + pud_t *pud; + + /* The only possible pmd mapping has been handled on last iteration */ + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + + /* Only for THP, seek to next pte entry makes sense */ + if (pvmw->pte) { + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + goto next_pte; + } + + if (unlikely(PageHuge(pvmw->page))) { + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address); + if (!pvmw->pte) + return false; + + pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + spin_lock(pvmw->ptl); + if (!check_pte(pvmw)) + return not_found(pvmw); + return true; + } +restart: + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, pvmw->address); + if (!pud_present(*pud)) + return false; + pvmw->pmd = pmd_offset(pud, pvmw->address); + if (pmd_trans_huge(*pvmw->pmd)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + if (!pmd_present(*pvmw->pmd)) + return not_found(pvmw); + if (likely(pmd_trans_huge(*pvmw->pmd))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (pmd_page(*pvmw->pmd) != page) + return not_found(pvmw); + return true; + } else { + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + } else { + if (!check_pmd(pvmw)) + return false; + } + if (!map_pte(pvmw)) + goto next_pte; + while (1) { + if (check_pte(pvmw)) + return true; +next_pte: do { + pvmw->address += PAGE_SIZE; + if (pvmw->address >= + __vma_address(pvmw->page, pvmw->vma) + + hpage_nr_pages(pvmw->page) * PAGE_SIZE) + return not_found(pvmw); + /* Did we cross page table boundary? */ + if (pvmw->address % PMD_SIZE == 0) { + pte_unmap(pvmw->pte); + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + goto restart; + } else { + pvmw->pte++; + } + } while (pte_none(*pvmw->pte)); + + if (!pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + } +} -- cgit v1.2.3 From c2740dc954212f16f88ef53fd2bbb96986cfd03b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 8 Feb 2017 10:27:32 +1100 Subject: mm: drop page_check_address{,_transhuge} All users are gone. Let's drop them. Link: http://lkml.kernel.org/r/20170129173858.45174-12-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- include/linux/rmap.h | 36 -------------- mm/rmap.c | 138 --------------------------------------------------- 2 files changed, 174 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b76343610653..8c89e902df3e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -197,42 +197,6 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); -/* - * Used by uprobes to replace a userspace page safely - */ -pte_t *__page_check_address(struct page *, struct mm_struct *, - unsigned long, spinlock_t **, int); - -static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, - spinlock_t **ptlp, int sync) -{ - pte_t *ptep; - - __cond_lock(*ptlp, ptep = __page_check_address(page, mm, address, - ptlp, sync)); - return ptep; -} - -/* - * Used by idle page tracking to check if a page was referenced via page - * tables. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp); -#else -static inline bool page_check_address_transhuge(struct page *page, - struct mm_struct *mm, unsigned long address, - pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) -{ - *ptep = page_check_address(page, mm, address, ptlp, 0); - *pmdp = NULL; - return !!*ptep; -} -#endif - /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migarion entries rather than present PTEs */ diff --git a/mm/rmap.c b/mm/rmap.c index 80525820aada..8774791e2809 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -708,144 +708,6 @@ out: return pmd; } -/* - * Check that @page is mapped at @address into @mm. - * - * If @sync is false, page_check_address may perform a racy check to avoid - * the page table lock when the pte is not present (helpful when reclaiming - * highly shared pages). - * - * On success returns with pte mapped and locked. - */ -pte_t *__page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp, int sync) -{ - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return NULL; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - goto check; - } - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return NULL; - - pte = pte_offset_map(pmd, address); - /* Make a quick check before getting the lock */ - if (!sync && !pte_present(*pte)) { - pte_unmap(pte); - return NULL; - } - - ptl = pte_lockptr(mm, pmd); -check: - spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; - } - pte_unmap_unlock(pte, ptl); - return NULL; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * Check that @page is mapped at @address into @mm. In contrast to - * page_check_address(), this function can handle transparent huge pages. - * - * On success returns true with pte mapped and locked. For PMD-mapped - * transparent huge pages *@ptep is set to NULL. 
- */ -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return false; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - pmd = NULL; - goto check_pte; - } - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return false; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return false; - pmd = pmd_offset(pud, address); - - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock_pmd; - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - goto map_pte; - } - - if (pmd_page(*pmd) != page) - goto unlock_pmd; - - pte = NULL; - goto found; -unlock_pmd: - spin_unlock(ptl); - return false; - } else { - pmd_t pmde = *pmd; - - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - return false; - } -map_pte: - pte = pte_offset_map(pmd, address); - if (!pte_present(*pte)) { - pte_unmap(pte); - return false; - } - - ptl = pte_lockptr(mm, pmd); -check_pte: - spin_lock(ptl); - - if (!pte_present(*pte)) { - pte_unmap_unlock(pte, ptl); - return false; - } - - /* THP can be referenced by any subpage */ - if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { - pte_unmap_unlock(pte, ptl); - return false; - } -found: - *ptep = pte; - *pmdp = pmd; - *ptlp = ptl; - return true; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - struct page_referenced_arg { int mapcount; int referenced; -- cgit v1.2.3 From 013e652dfcf66ce6b79e537748c13ff46d61c0eb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:27:33 +1100 Subject: userfaultfd: non-cooperative: add event for memory unmaps When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we first should create a temporary representation for the unmap event for each uffd context and then notify them one by one to the appropriate userfault file descriptors. The event notification occurs after the mmap_sem has been released. Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- arch/mips/kernel/vdso.c | 2 +- arch/tile/mm/elf.c | 2 +- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/mm/mpx.c | 4 +-- fs/aio.c | 2 +- fs/proc/vmcore.c | 4 +-- fs/userfaultfd.c | 65 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 14 +++++---- include/linux/userfaultfd_k.h | 18 +++++++++++ include/uapi/linux/userfaultfd.h | 3 ++ ipc/shm.c | 6 ++-- mm/mmap.c | 46 ++++++++++++++++++---------- mm/mremap.c | 23 ++++++++------ mm/util.c | 5 +++- 14 files changed, 155 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index f9dbfb14af33..093517e85a6c 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -111,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ|VM_WRITE|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - 0); + 0, NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 6225cc998db1..889901824400 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -143,7 +143,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long addr = MEM_USER_INTRPT; addr = mmap_region(NULL, addr, INTRPT_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0, NULL); if (addr > (unsigned long) -PAGE_SIZE) retval = (int) addr; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10820f6cefbf..572cee3fccff 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size); + do_munmap(mm, text_start, image->size, NULL); } else { current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index aad4ac386f98..c98079684bdb 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. 
*/ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); } static int try_unmap_single_bt(struct mm_struct *mm, diff --git a/fs/aio.c b/fs/aio.c index 873b4ca82ccb..7e2ab9c8e39c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -512,7 +512,7 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused); + MAP_SHARED, 0, &unused, NULL); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f52d8e857ff7..885d445afa0d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -388,7 +388,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, } return 0; fail: - do_munmap(vma->vm_mm, from, len); + do_munmap(vma->vm_mm, from, len, NULL); return -EAGAIN; } @@ -481,7 +481,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) return 0; fail: - do_munmap(vma->vm_mm, vma->vm_start, len); + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); return -EAGAIN; } #else diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8fe601b4875e..4c78458ea78d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -71,6 +71,13 @@ struct userfaultfd_fork_ctx { struct list_head list; }; +struct userfaultfd_unmap_ctx { + struct userfaultfd_ctx *ctx; + unsigned long start; + unsigned long end; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -709,6 +716,64 @@ void userfaultfd_remove(struct vm_area_struct *vma, down_read(&mm->mmap_sem); } +static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, + unsigned long start, unsigned long end) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + + list_for_each_entry(unmap_ctx, unmaps, list) + if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && + unmap_ctx->end == end) + return true; + + return false; +} + +int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *unmaps) +{ + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + continue; + + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; + + userfaultfd_ctx_get(ctx); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); + } + + return 0; +} + +void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) +{ + struct userfaultfd_unmap_ctx *ctx, *n; + struct userfaultfd_wait_queue ewq; + + list_for_each_entry_safe(ctx, n, uf, list) { + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_UNMAP; + ewq.msg.arg.remove.start = ctx->start; + ewq.msg.arg.remove.end = ctx->end; + + userfaultfd_event_wait_completion(ctx->ctx, &ewq); + + list_del(&ctx->list); + kfree(ctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7278475c489d..4a1e09316ed1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2086,18 +2086,22 @@ extern int install_special_mapping(struct mm_struct *mm, extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned 
long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); -extern int do_munmap(struct mm_struct *, unsigned long, size_t); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } #ifdef CONFIG_MMU diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 2521542f6c07..a40be5d0661b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf); +extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma, unsigned long end) { } + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf) +{ + return 0; +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b742c40c2880..3b059530dac9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,6 +21,7 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -110,6 +111,7 @@ struct uffd_msg { #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -158,6 +160,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) __u64 features; __u64 ioctls; diff --git a/ipc/shm.c b/ipc/shm.c index 7f6537b84ef5..cb95152e2806 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1222,7 +1222,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto invalid; } - addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) @@ -1329,7 +1329,7 @@ SYSCALL_DEFINE1(shmdt, 
char __user *, shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); vma = next; } diff --git a/mm/mmap.c b/mm/mmap.c index 13d16a2b7623..1cec28d20583 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; unsigned long min_brk; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) goto set_brk; goto out; } @@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) goto out; set_brk: mm->brk = brk; populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; @@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm, unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. 
* Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (vma->vm_start >= end) return 0; + if (uf) { + int error = userfaultfd_unmap_prep(vma, start, end, uf); + + if (error) + return error; + } + /* * If we need to split any vma, do it now to save pain later. * @@ -2668,12 +2680,14 @@ int vm_munmap(unsigned long start, size_t len) { int ret; struct mm_struct *mm = current->mm; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len); + ret = do_munmap(mm, start, len, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } EXPORT_SYMBOL(vm_munmap); @@ -2773,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate); + prot, flags, pgoff, &populate, NULL); fput(file); out: up_write(&mm->mmap_sem); @@ -2799,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2838,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2885,9 +2899,9 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) { - return do_brk_flags(addr, len, 0); + return do_brk_flags(addr, len, 0, uf); } int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) @@ -2895,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) struct mm_struct *mm = current->mm; int ret; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk_flags(addr, len, flags); + ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; diff --git a/mm/mremap.c b/mm/mremap.c index 8779928d6a70..8233b0105c82 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, - bool *locked, struct vm_userfaultfd_ctx *uf) + bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable 
to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, - struct vm_userfaultfd_ctx *uf) + struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len, NULL); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + uf_unmap); if (!(offset_in_page(ret))) goto out; out1: @@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf); + &locked, &uf, &uf_unmap); goto out; } @@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); if (ret && old_len != new_len) goto out; ret = addr; @@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf); + &locked, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { @@ -609,5 +613,6 @@ out: if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/util.c b/mm/util.c index 3cb2164f4099..b8f538863b5a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); + &populate, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } -- cgit v1.2.3 From bc3dca5a3fb6ae1312d64c76adf889e8b14b5b04 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 8 Feb 2017 10:27:35 +1100 Subject: userfaultfd: non-cooperative: add event for exit() notification Allow userfaultfd monitor track termination of the processes that have memory backed by the uffd. 
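[Editorial note, not part of the patches: a minimal userspace sketch of how a non-cooperative monitor might consume the UFFD_EVENT_UNMAP and UFFD_EVENT_EXIT notifications added by this and the previous patch. It assumes the userfaultfd was already created and its UFFDIO_API handshake requested both event features; drain_uffd_events() is a hypothetical helper name. The unmapped range is reported through msg.arg.remove, as the kernel-side code above shows.]

#include <linux/userfaultfd.h>
#include <stdio.h>
#include <unistd.h>

/* Read queued events from an already-registered uffd; returns 0 once the
 * monitored process has exited, 1 otherwise. */
static int drain_uffd_events(int uffd)
{
	struct uffd_msg msg;

	while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
		if (msg.event == UFFD_EVENT_UNMAP)
			/* unmapped range is carried in the "remove" argument */
			printf("unmap %llx-%llx\n",
			       (unsigned long long)msg.arg.remove.start,
			       (unsigned long long)msg.arg.remove.end);
		else if (msg.event == UFFD_EVENT_EXIT)
			return 0;	/* target exited: stop background copying */
		/* page faults, fork/remap/remove events elided here */
	}
	return 1;
}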
Link: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 24 ++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 5 ++++- kernel/exit.c | 2 ++ 4 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4c78458ea78d..73b682519430 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -774,6 +774,30 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } +void userfaultfd_exit(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma) { + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { + struct userfaultfd_wait_queue ewq; + + userfaultfd_ctx_get(ctx); + + msg_init(&ewq.msg); + ewq.msg.event = UFFD_EVENT_EXIT; + + userfaultfd_event_wait_completion(ctx, &ewq); + + ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; + } + + vma = vma->vm_next; + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a40be5d0661b..0468548acebf 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern void userfaultfd_exit(struct mm_struct *mm); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } + +static inline void userfaultfd_exit(struct mm_struct *mm) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..c055947c5c98 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). 
*/ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -112,6 +113,7 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -161,6 +163,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..376d25b550b4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -518,6 +519,7 @@ static void exit_mm(struct task_struct *tsk) enter_lazy_tlb(mm, current); task_unlock(tsk); mm_update_next_owner(mm); + userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); -- cgit v1.2.3 From 6b4b7141c2b9a4db8d06fd82bf2b8e09165dec25 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 8 Feb 2017 10:27:37 +1100 Subject: mm: alloc_contig_range: allow to specify GFP mask Currently alloc_contig_range assumes that the compaction should be done with the default GFP_KERNEL flags. This is probably right for all current uses of this interface, but may change as CMA is used in more use-cases (including being the default DMA memory allocator on some platforms). Change the function prototype, to allow for passing through the GFP mask set by upper layers. Also respect global restrictions by applying memalloc_noio_flags to the passed in flags. Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- mm/cma.c | 3 ++- mm/hugetlb.c | 3 ++- mm/page_alloc.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0fe0b6295ab5..db373b9d3223 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void) #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. 
*/ extern int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype); + unsigned migratetype, gfp_t gfp_mask); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif diff --git a/mm/cma.c b/mm/cma.c index c960459eda7e..fbd67d866f67 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -407,7 +407,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + GFP_KERNEL); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1b8789a2395d..d0d1d083c432 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a962a8a1858d..2325fa09030f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7401,6 +7401,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7414,7 +7415,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7426,7 +7427,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3 From 8dc16a2a8be58d109e74feb2695659b96044c037 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 8 Feb 2017 10:27:37 +1100 Subject: mm: cma_alloc: allow to specify GFP mask Most users of this interface just want to use it with the default GFP_KERNEL flags, but for cases where DMA memory is allocated it may be called from a different context. No functional change yet, just passing through the flag to the underlying alloc_contig_range function. 
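[Editorial note, for illustration only: a sketch of a caller once both changes are in, showing the GFP mask chosen by the caller being handed to cma_alloc() and from there to alloc_contig_range()'s compaction context. The helper name, the cma area pointer, and the choice of GFP_NOIO as the restricted mask are assumptions; only the cma_alloc() signature comes from the patch.]

#include <linux/cma.h>
#include <linux/gfp.h>

/* Hypothetical helper: allocate "count" contiguous pages from a CMA area,
 * letting the caller pick how much reclaim/compaction may block or do I/O. */
static struct page *demo_cma_pages(struct cma *area, size_t count, bool can_do_io)
{
	gfp_t gfp = can_do_io ? GFP_KERNEL : GFP_NOIO;

	/* The mask now reaches the compact_control used by alloc_contig_range(). */
	return cma_alloc(area, count, 0, gfp);
}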
Link: http://lkml.kernel.org/r/20170127172328.18574-2-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++- drivers/base/dma-contiguous.c | 2 +- include/linux/cma.h | 3 ++- mm/cma.c | 5 +++-- mm/cma_debug.c | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5bb24be0b346..56a62d97ab2d 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -56,7 +56,8 @@ struct page *kvm_alloc_hpt(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); + return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES), + GFP_KERNEL); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt); diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index e167a1e1bccb..d1a9cbabc627 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -193,7 +193,7 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align); + return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL); } /** diff --git a/include/linux/cma.h b/include/linux/cma.h index 6f0a91b37f68..03f32d0bd1d8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -29,6 +29,7 @@ extern int __init cma_declare_contiguous(phys_addr_t base, extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align); +extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); #endif diff --git a/mm/cma.c b/mm/cma.c index fbd67d866f67..a33ddfde315d 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -362,7 +362,8 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. 
*/ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) +struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask) { unsigned long mask, offset; unsigned long pfn = -1; @@ -408,7 +409,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - GFP_KERNEL); + gfp_mask); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f8e4b60db167..ffc0c3d0ae64 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0); + p = cma_alloc(cma, count, 0, GFP_KERNEL); if (!p) { kfree(mem); return -ENOMEM; -- cgit v1.2.3 From 76bc036d3fae93fc333dd62952f04bcb50e7d0c6 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 8 Feb 2017 10:27:38 +1100 Subject: mm: wire up GFP flag passing in dma_alloc_from_contiguous The callers of the DMA alloc functions already provide the proper context GFP flags. Make sure to pass them through to the CMA allocator, to make the CMA compaction context aware. Link: http://lkml.kernel.org/r/20170127172328.18574-3-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- arch/arm/mm/dma-mapping.c | 16 +++++++++------- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/mips/mm/dma-default.c | 4 ++-- arch/x86/kernel/pci-dma.c | 3 ++- arch/xtensa/kernel/pci-dma.c | 3 ++- drivers/base/dma-contiguous.c | 5 +++-- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- include/linux/dma-contiguous.h | 4 ++-- 9 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index ab7710002ba6..4d6ec7d821c8 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -349,7 +349,7 @@ static void __dma_free_buffer(struct page *page, size_t size) static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, const void *caller, bool want_vaddr, - int coherent_flag); + int coherent_flag, gfp_t gfp); static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, pgprot_t prot, struct page **ret_page, @@ -420,7 +420,8 @@ static int __init atomic_pool_init(void) */ if (dev_get_cma_area(NULL)) ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot, - &page, atomic_pool_init, true, NORMAL); + &page, atomic_pool_init, true, NORMAL, + GFP_KERNEL); else ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot, &page, atomic_pool_init, true); @@ -594,14 +595,14 @@ static int __free_from_pool(void *start, size_t size) static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, const void *caller, bool want_vaddr, - int coherent_flag) + int coherent_flag, gfp_t gfp) { unsigned long order = get_order(size); size_t count = size >> PAGE_SHIFT; struct page *page; void *ptr = NULL; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, gfp); if (!page) return NULL; @@ -655,7 +656,7 @@ static inline pgprot_t __get_dma_pgprot(unsigned long attrs, 
pgprot_t prot) #define __get_dma_pgprot(attrs, prot) __pgprot(0) #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv) NULL #define __alloc_from_pool(size, ret_page) NULL -#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag) NULL +#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag, gfp) NULL #define __free_from_pool(cpu_addr, size) do { } while (0) #define __free_from_contiguous(dev, page, cpu_addr, size, wv) do { } while (0) #define __dma_free_remap(cpu_addr, size) do { } while (0) @@ -697,7 +698,8 @@ static void *cma_allocator_alloc(struct arm_dma_alloc_args *args, { return __alloc_from_contiguous(args->dev, args->size, args->prot, ret_page, args->caller, - args->want_vaddr, args->coherent_flag); + args->want_vaddr, args->coherent_flag, + args->gfp); } static void cma_allocator_free(struct arm_dma_free_args *args) @@ -1293,7 +1295,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, unsigned long order = get_order(size); struct page *page; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, gfp); if (!page) goto error; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index e04082700bb1..b998f56e6a53 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -107,7 +107,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, void *addr; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size)); + get_order(size), flags); if (!page) return NULL; @@ -379,7 +379,7 @@ static int __init atomic_pool_init(void) if (dev_get_cma_area(NULL)) page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order); + pool_size_order, GFP_KERNEL); else page = alloc_pages(GFP_DMA, pool_size_order); diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index a39c36af97ad..1895a692efd4 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -148,8 +148,8 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size, gfp = massage_gfp_flags(dev, gfp); if (IS_ENABLED(CONFIG_DMA_CMA) && gfpflags_allow_blocking(gfp)) - page = dma_alloc_from_contiguous(dev, - count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + gfp); if (!page) page = alloc_pages(gfp, get_order(size)); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..d5c223c9cf11 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -91,7 +91,8 @@ again: page = NULL; /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); page = NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 70e362e6038e..34c1f9fa6acc 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -158,7 +158,8 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, flag |= GFP_DMA; if (gfpflags_allow_blocking(flag)) - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (!page) page = alloc_pages(flag, get_order(size)); diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 
d1a9cbabc627..b55804cac4c4 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -181,6 +181,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP flags to use for this allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -188,12 +189,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. */ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align) + unsigned int align, gfp_t gfp_mask) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL); + return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); } /** diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 3ef0f42984f2..bd2e335cdf39 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2668,7 +2668,7 @@ static void *alloc_coherent(struct device *dev, size_t size, return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size)); + get_order(size), flag); if (!page) return NULL; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 8a185250ae5a..3f804eb4299a 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3827,7 +3827,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, flags); if (page && iommu_no_mapping(dev) && page_to_phys(page) + size > dev->coherent_dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index fec734df1524..b67bf6ac907d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order); + unsigned int order, gfp_t gfp_mask); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order) + unsigned int order, gfp_t gfp_mask) { return NULL; } -- cgit v1.2.3 From f09d0de5d2b7d47830aba3c493763cff016c355b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 8 Feb 2017 10:27:38 +1100 Subject: mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count If madvise(2) advice will result in the underlying vma being split and the number of areas mapped by the process will exceed /proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN. EAGAIN is returned by madvise(2) when a kernel resource, such as slab, is temporarily unavailable. It indicates that userspace should retry the advice in the near future. 
This is important for advice such as MADV_DONTNEED which is often used by malloc implementations to free memory back to the system: we really do want to free memory back when madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas, or mempolicies) cannot be allocated. Encountering /proc/sys/vm/max_map_count is not a temporary failure, however, so return ENOMEM to indicate this is a more serious issue. A followup patch to the man page will specify this behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Johannes Weiner Cc: Jerome Marchand Cc: "Kirill A. Shutemov" Cc: Michael Kerrisk Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/sysctl/vm.txt | 4 ++-- Documentation/vm/ksm.txt | 4 ++++ include/linux/mm.h | 6 ++++-- mm/madvise.c | 51 +++++++++++++++++++++++++++++++++++++-------- mm/mmap.c | 8 +++---- 5 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 95ccbe6d79ce..b4ad97f10b8e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -376,8 +376,8 @@ max_map_count: This file contains the maximum number of memory map areas a process may have. Memory map areas are used as a side-effect of calling -malloc, directly by mmap and mprotect, and also when loading shared -libraries. +malloc, directly by mmap, mprotect, and madvise, and also when loading +shared libraries. While most applications need less than a thousand maps, certain programs, particularly malloc debuggers, may consume lots of them, diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 0c64a81d6808..6b0ca7feb135 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range cannot contain any pages which KSM could actually merge; even if MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. +If a region of memory must be split into at least one new MADV_MERGEABLE +or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process +will exceed vm.max_map_count (see Documentation/sysctl/vm.txt). 
+ Like other madvise calls, they are intended for use on mapped areas of the user address space: they will report ENOMEM if the specified range includes unmapped gaps (though working on the intervening mapped areas), diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a1e09316ed1..2be2ea04b1c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2037,8 +2037,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); diff --git a/mm/madvise.c b/mm/madvise.c index 0012071a6e50..11fc65f81ecd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; } @@ -120,15 +134,37 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, start, 1); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; + goto out; + } + error = __split_vma(mm, vma, end, 0); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } } success: @@ -136,10 +172,7 @@ success: * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; - out: - if (error == -ENOMEM) - error = -EAGAIN; return error; } diff --git a/mm/mmap.c b/mm/mmap.c index 1cec28d20583..499b988b1639 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2499,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. We use this on the - * munmap path where it doesn't make sense to fail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. 
*/ -static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; int err; -- cgit v1.2.3 From b62715ac3c44416a899309496509758258b643e1 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 8 Feb 2017 10:27:47 +1100 Subject: kasan: drain quarantine of memcg slab objects Per memcg slab accounting and kasan have a problem with kmem_cache destruction. - kmem_cache_create() allocates a kmem_cache, which is used for allocations from processes running in root (top) memcg. - Processes running in non root memcg and allocating with either __GFP_ACCOUNT or from a SLAB_ACCOUNT cache use a per memcg kmem_cache. - Kasan catches use-after-free by having kfree() and kmem_cache_free() defer freeing of objects. Objects are placed in a quarantine. - kmem_cache_destroy() destroys root and non root kmem_caches. It takes care to drain the quarantine of objects from the root memcg's kmem_cache, but ignores objects associated with non root memcg. This causes leaks because quarantined per memcg objects refer to per memcg kmem cache being destroyed. To see the problem: 1) create a slab cache with kmem_cache_create(,,,SLAB_ACCOUNT,) 2) from non root memcg, allocate and free a few objects from cache 3) dispose of the cache with kmem_cache_destroy() kmem_cache_destroy() will trigger a "Slab cache still has objects" warning indicating that the per memcg kmem_cache structure was leaked. Fix the leak by draining kasan quarantined objects allocated from non root memcg. Racing memcg deletion is tricky, but handled. kmem_cache_destroy() => shutdown_memcg_caches() => __shutdown_memcg_cache() => shutdown_cache() flushes per memcg quarantined objects, even if that memcg has been rmdir'd and gone through memcg_deactivate_kmem_caches(). This leak only affects destroyed SLAB_ACCOUNT kmem caches when kasan is enabled. So I don't think it's worth patching stable kernels. 
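[Editorial note: the three reproduction steps above, written out as a sketch. The cache name and object size are illustrative, and it assumes the allocation runs from a task in a non-root memcg with KASAN enabled.]

#include <linux/slab.h>

static void kasan_memcg_leak_demo(void)
{
	struct kmem_cache *c;
	void *obj;

	/* 1) an accounted cache gets a per-memcg copy for non-root tasks */
	c = kmem_cache_create("leak_demo", 128, 0, SLAB_ACCOUNT, NULL);
	if (!c)
		return;

	/* 2) allocate and free: the object sits in the KASAN quarantine */
	obj = kmem_cache_alloc(c, GFP_KERNEL);
	if (obj)
		kmem_cache_free(c, obj);

	/*
	 * 3) before this fix, the per-memcg kmem_cache could still hold
	 * quarantined objects here, so "Slab cache still has objects" fired
	 * and the per-memcg cache structure leaked.
	 */
	kmem_cache_destroy(c);
}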
Link: http://lkml.kernel.org/r/1482257462-36948-1-git-send-email-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Vladimir Davydov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/kasan.h | 4 ++-- mm/kasan/kasan.c | 2 +- mm/kasan/quarantine.c | 1 + mm/slab_common.c | 4 +++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 820c0ad54a01..c908b25bf5a5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -52,7 +52,7 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); void kasan_cache_shrink(struct kmem_cache *cache); -void kasan_cache_destroy(struct kmem_cache *cache); +void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -98,7 +98,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} -static inline void kasan_cache_destroy(struct kmem_cache *cache) {} +static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b2a0cff2bb35..25f0e6521f36 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -435,7 +435,7 @@ void kasan_cache_shrink(struct kmem_cache *cache) quarantine_remove_cache(cache); } -void kasan_cache_destroy(struct kmem_cache *cache) +void kasan_cache_shutdown(struct kmem_cache *cache) { quarantine_remove_cache(cache); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index dae929c02bbb..6f1ed1630873 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -274,6 +274,7 @@ static void per_cpu_remove_cache(void *arg) qlist_free_all(&to_free, cache); } +/* Free all quarantined objects belonging to cache. */ void quarantine_remove_cache(struct kmem_cache *cache) { unsigned long flags, i; diff --git a/mm/slab_common.c b/mm/slab_common.c index 2a36518609b0..16b39eeb208a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -528,6 +528,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) static int shutdown_cache(struct kmem_cache *s) { + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; @@ -816,7 +819,6 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); - kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; -- cgit v1.2.3 From d0b6e4bc04a661983a77e815e86f19890c227b3d Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 8 Feb 2017 10:27:49 +1100 Subject: uapi: mqueue.h: add missing linux/types.h include Commit 63159f5dcccb385 ("uapi: Use __kernel_long_t in struct mq_attr") changed the types from long to __kernel_long_t, but didn't add a linux/types.h include. Code that tries to include this header directly breaks: /usr/include/linux/mqueue.h:26:2: error: unknown type name '__kernel_long_t' __kernel_long_t mq_flags; /* message queue flags */ This also upsets configure tests for this header: checking linux/mqueue.h usability... 
no checking linux/mqueue.h presence... yes configure: WARNING: linux/mqueue.h: present but cannot be compiled configure: WARNING: linux/mqueue.h: check for missing prerequisite headers? configure: WARNING: linux/mqueue.h: see the Autoconf documentation configure: WARNING: linux/mqueue.h: section "Present But Cannot Be Compiled" configure: WARNING: linux/mqueue.h: proceeding with the compiler's result checking for linux/mqueue.h... no Link: http://lkml.kernel.org/r/20170119194644.4403-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton --- include/uapi/linux/mqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mqueue.h b/include/uapi/linux/mqueue.h index d0a2b8e89813..bbd5116ea739 100644 --- a/include/uapi/linux/mqueue.h +++ b/include/uapi/linux/mqueue.h @@ -18,6 +18,8 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H +#include <linux/types.h> + #define MQ_PRIO_MAX 32768 /* per-uid limit of kernel memory used by mqueue, in bytes */ #define MQ_BYTES_MAX 819200 -- cgit v1.2.3 From 468be46cbebfd826e9705368d96c89187f241fee Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 8 Feb 2017 10:27:50 +1100 Subject: include/linux/iopoll.h: include instead of The timer APIs this header needs are ktime_get(), ktime_add_us(), and ktime_compare(). So, including <linux/ktime.h> seems enough. This commit will cut unnecessary header file parsing. Link: http://lkml.kernel.org/r/1481679225-10885-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton --- include/linux/iopoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 1c30014ed176..d29e1e21bf3f 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From bf9058f08772c20b5c32ebe369753d4d9e59b986 Mon Sep 17 00:00:00 2001 From: Jinbum Park Date: Wed, 8 Feb 2017 10:27:50 +1100 Subject: mm: add arch-independent testcases for RODATA This patch adds arch-independent testcases for RODATA. Both x86 and x86_64 already have testcases for RODATA, but they are arch-specific because they use inline assembly directly. Also, cacheflush.h is not a suitable location for rodata-test related things: since the testcases lived in cacheflush.h, changing the state of CONFIG_DEBUG_RODATA_TEST caused kernel build overhead. To solve the above issues, write arch-independent testcases and move them to a shared location. Link: http://lkml.kernel.org/r/20170129105436.GA9303@pjb1027-Latitude-E5410 Signed-off-by: Jinbum Park Acked-by: Kees Cook Cc: Ingo Molnar Cc: H.
Peter Anvin Cc: Arjan van de Ven Cc: Laura Abbott Cc: Russell King Signed-off-by: Andrew Morton --- arch/x86/Kconfig.debug | 10 +----- arch/x86/include/asm/cacheflush.h | 10 ------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/test_rodata.c | 75 --------------------------------------- arch/x86/mm/init_32.c | 4 --- arch/x86/mm/init_64.c | 5 --- include/linux/rodata_test.h | 24 +++++++++++++ init/main.c | 6 ++-- mm/Kconfig.debug | 7 ++++ mm/Makefile | 1 + mm/rodata_test.c | 56 +++++++++++++++++++++++++++++ 11 files changed, 93 insertions(+), 106 deletions(-) delete mode 100644 arch/x86/kernel/test_rodata.c create mode 100644 include/linux/rodata_test.h create mode 100644 mm/rodata_test.c (limited to 'include') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 67eec55093a5..3fa469c82707 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,14 +74,6 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. -config DEBUG_RODATA_TEST - bool "Testcase for the marking rodata read-only" - default y - ---help--- - This option enables a testcase for the setting rodata read-only - as well as for the change_page_attr() infrastructure. - If in doubt, say "N" - config DEBUG_WX bool "Warn on W+X mappings at boot" select X86_PTDUMP_CORE @@ -122,7 +114,7 @@ config DEBUG_SET_MODULE_RONX config DEBUG_NX_TEST tristate "Testcase for the NX non-executable stack feature" - depends on DEBUG_KERNEL && m + depends on DEBUG_KERNEL && DEBUG_RODATA_TEST && m ---help--- This option enables a testcase for the CPU NX capability and the software setup of this feature. diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 872877d930de..e7e1942edff7 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -90,18 +90,8 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#ifdef CONFIG_DEBUG_RODATA_TEST -int rodata_test(void); -#else -static inline int rodata_test(void) -{ - return 0; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 581386c7e429..f6caf829085a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,7 +100,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o -obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c deleted file mode 100644 index 222e84e2432e..000000000000 --- a/arch/x86/kernel/test_rodata.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * test_rodata.c: functional test for mark_rodata_ro function - * - * (C) Copyright 2008 Intel Corporation - * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
- */ -#include -#include -#include - -int rodata_test(void) -{ - unsigned long result; - unsigned long start, end; - - /* test 1: read the value */ - /* If this test fails, some previous testrun has clobbered the state */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); - return -ENODEV; - } - - /* test 2: write to the variable; this should fault */ - /* - * If this test fails, we managed to overwrite the data - * - * This is written in assembly to be able to catch the - * exception that is supposed to happen in the correct - * case - */ - - result = 1; - asm volatile( - "0: mov %[zero],(%[rodata_test])\n" - " mov %[zero], %[rslt]\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - : [rslt] "=r" (result) - : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) - ); - - - if (!result) { - printk(KERN_ERR "rodata_test: test data was not read only\n"); - return -ENODEV; - } - - /* test 3: check the value hasn't changed */ - /* If this test fails, we managed to overwrite the data */ - if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); - return -ENODEV; - } - /* test 4: check if the rodata section is 4Kb aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); - return -ENODEV; - } - if (end & (PAGE_SIZE - 1)) { - printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); - return -ENODEV; - } - - return 0; -} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 928d657de829..2b4b53e6793f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -864,9 +864,6 @@ static noinline int do_test_wp_bit(void) return flag; } -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) @@ -939,7 +936,6 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", size >> 10); - rodata_test(); #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fff9138c442..a4880d8143c7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1011,9 +1011,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); - int kernel_set_to_readonly; void set_kernel_text_rw(void) @@ -1082,8 +1079,6 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); - rodata_test(); - #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h new file mode 100644 index 000000000000..562537f85a28 --- /dev/null +++ b/include/linux/rodata_test.h @@ -0,0 +1,24 @@ +/* + * rodata_test.h: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#ifndef _RODATA_TEST_H +#define _RODATA_TEST_H + +#ifdef CONFIG_DEBUG_RODATA_TEST +extern const int rodata_test_data; +void rodata_test(void); +#else +static inline void rodata_test(void) {} +#endif + +#endif /* _RODATA_TEST_H */ + diff --git a/init/main.c b/init/main.c index b0c9d6facef9..88bdaa9a5cde 100644 --- a/init/main.c +++ b/init/main.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -937,9 +938,10 @@ __setup("rodata=", set_debug_rodata); #ifdef CONFIG_DEBUG_RODATA static void mark_readonly(void) { - if (rodata_enabled) + if (rodata_enabled) { mark_rodata_ro(); - else + rodata_test(); + } else pr_info("Kernel memory protection disabled.\n"); } #else diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index afcc550877ff..3e5eada506df 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -90,3 +90,10 @@ config DEBUG_PAGE_REF careful when enabling this feature because it adds about 30 KB to the kernel code. However the runtime performance overhead is virtually nil until the tracepoints are actually enabled. + +config DEBUG_RODATA_TEST + bool "Testcase for the marking rodata read-only" + depends on DEBUG_RODATA + ---help--- + This option enables a testcase for the setting rodata read-only. + diff --git a/mm/Makefile b/mm/Makefile index aa0aa17cb413..026f6a828a50 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/rodata_test.c b/mm/rodata_test.c new file mode 100644 index 000000000000..0fd21670b513 --- /dev/null +++ b/mm/rodata_test.c @@ -0,0 +1,56 @@ +/* + * rodata_test.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ +#include +#include + +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +void rodata_test(void) +{ + unsigned long start, end; + int zero = 0; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + pr_err("rodata_test: test 1 fails (start data)\n"); + return; + } + + /* test 2: write to the variable; this should fault */ + if (!probe_kernel_write((void *)&rodata_test_data, + (void *)&zero, sizeof(zero))) { + pr_err("rodata_test: test data was not read only\n"); + return; + } + + /* test 3: check the value hasn't changed */ + if (rodata_test_data == zero) { + pr_err("rodata_test: test data was changed\n"); + return; + } + + /* test 4: check if the rodata section is PAGE_SIZE aligned */ + start = (unsigned long)__start_rodata; + end = (unsigned long)__end_rodata; + if (start & (PAGE_SIZE - 1)) { + pr_err("rodata_test: start of .rodata is not page size aligned\n"); + return; + } + if (end & (PAGE_SIZE - 1)) { + pr_err("rodata_test: end of .rodata is not page size aligned\n"); + return; + } + + pr_info("rodata_test: all tests were successful\n"); +} -- cgit v1.2.3 From b0984729e8ba03bfe40edfe338b510d384be57f1 Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Wed, 8 Feb 2017 10:27:50 +1100 Subject: compiler-gcc.h: add a new macro to wrap gcc attribute Add __mode(x) into compiler-gcc.h as part of a cleanup task I've taken up, to replace gcc specific attributes with macros. The next patch is a cleanup of the m68k subsystem and it requires a new macro to wrap __attribute__ ((mode (...))) Link: http://lkml.kernel.org/r/1485540901-1988-2-git-send-email-gidisrael@gmail.com Signed-off-by: Gideon Israel Dsouza Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: Paul Gortmaker Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 0444b1336268..980fb35c8318 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -121,6 +121,7 @@ #define __attribute_const__ __attribute__((__const__)) #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) +#define __mode(x) __attribute__((mode(x))) /* gcc version specific checks */ -- cgit v1.2.3 From 12df38e0b1b2439e4187fc7076c346589cd05a6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 Feb 2017 10:27:51 +1100 Subject: bug: switch data corruption check to __must_check The CHECK_DATA_CORRUPTION() macro was designed to have callers do something meaningful/protective on failure. However, using "return false" in the macro too strictly limits the design patterns of callers. Instead, let callers handle the logic test directly, but make sure that the result IS checked by forcing __must_check (which appears to not be able to be used directly on macro expressions). 
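To illustrate the resulting calling convention (a hypothetical caller, not taken from this patch; the structure and messages are invented), the check's boolean result must now be consumed and the bail-out written explicitly in the caller, mirroring what the list debugging code below does:

    #include <linux/bug.h>
    #include <linux/types.h>

    struct foo_node {
            struct foo_node *next;
            struct foo_node *prev;
    };

    /* Reports corruption (WARN or BUG, depending on
     * CONFIG_BUG_ON_DATA_CORRUPTION) and returns false so the caller can
     * back out instead of touching the damaged structure. */
    static bool foo_node_valid(struct foo_node *node)
    {
            if (CHECK_DATA_CORRUPTION(node->next == NULL,
                            "foo: node %p has NULL next\n", node) ||
                CHECK_DATA_CORRUPTION(node->prev == NULL,
                            "foo: node %p has NULL prev\n", node))
                    return false;

            return true;
    }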
Link: http://lkml.kernel.org/r/20170206204547.GA125312@beast Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/bug.h | 12 +++++++----- lib/list_debug.c | 45 ++++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index baff2e8fc8a8..5828489309bb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -124,18 +124,20 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, /* * Since detected data corruption should stop operation on the affected - * structures, this returns false if the corruption condition is found. + * structures. Return value must be checked and sanely acted on by caller. */ +static inline __must_check bool check_data_corruption(bool v) { return v; } #define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ - do { \ - if (unlikely(condition)) { \ + check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ } else \ WARN(1, fmt, ##__VA_ARGS__); \ - return false; \ } \ - } while (0) + corruption; \ + })) #endif /* _LINUX_BUG_H */ diff --git a/lib/list_debug.c b/lib/list_debug.c index 7f7bfa55eb6d..a34db8d27667 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,15 +20,16 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); + if (CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next) || + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev) || + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next)) + return false; return true; } @@ -41,18 +42,20 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1); - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2); - CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %p, but was %p\n", - entry, prev->next); - CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", - entry, next->prev); + if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, + "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1) || + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2) || + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next) || + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. 
next->prev should be %p, but was %p\n", + entry, next->prev)) + return false; + return true; } -- cgit v1.2.3 From 8a23735f2cfa694c2639feb79f3cdc0bdd761859 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Wed, 8 Feb 2017 10:27:54 +1100 Subject: linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on a thermal driver I encounter a scenario where the divisor could be negative, instead of adding local code to handle this I though I first try to add support for this in DIV_ROUND_CLOSEST. Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend and divisor variable types are signed. This should not alter current behavior for users of the macro as previously negative divisors where not supported. Before: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -14 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 14 After: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -15 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 15 Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Niklas Söderlund Reviewed-by: Guenter Roeck Signed-off-by: Andrew Morton --- include/linux/kernel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cb09238f6d32..b1a839b59add 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -100,16 +100,18 @@ ) /* - * Divide positive or negative dividend by positive divisor and round - * to closest integer. Result is undefined for negative divisors and - * for negative dividends if the divisor variable type is unsigned. + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if dividends variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. */ #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(x) __x = x; \ typeof(divisor) __d = divisor; \ (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || (__x) > 0) ? \ + ((typeof(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? \ (((__x) + ((__d) / 2)) / (__d)) : \ (((__x) - ((__d) / 2)) / (__d)); \ } \ -- cgit v1.2.3 From 870a99f8201b29f5ab5d2e2b245353bda14bccf8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 8 Feb 2017 10:27:55 +1100 Subject: linux-kernelh-fix-div_round_closest-to-support-negative-divisors-fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix comment, per Guenter Cc: Guenter Roeck Cc: Niklas Söderlund Signed-off-by: Andrew Morton --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b1a839b59add..4c26dc3a8295 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -102,7 +102,7 @@ /* * Divide positive or negative dividend by positive or negative divisor * and round to closest integer. Result is undefined for negative - * divisors if dividends variable type is unsigned and for negative + * divisors if he dividend variable type is unsigned and for negative * dividends if the divisor variable type is unsigned. 
*/ #define DIV_ROUND_CLOSEST(x, divisor)( \ -- cgit v1.2.3 From d0dc64ab17012c1085dd8eee435f1d117efc9ea5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 Feb 2017 10:27:55 +1100 Subject: rbtree: use designated initializers Prepare to mark sensitive kernel structures for randomization by making sure they're using designated initializers. These were identified during allyesconfig builds of x86, arm, and arm64, with most initializer fixes extracted from grsecurity. Link: http://lkml.kernel.org/r/20161217010253.GA140470@beast Signed-off-by: Kees Cook Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Jie Chen Signed-off-by: Andrew Morton --- include/linux/rbtree_augmented.h | 4 +++- lib/rbtree.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index d076183e49be..9702b6e183bc 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -90,7 +90,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ old->rbaugmented = rbcompute(old); \ } \ rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ + .propagate = rbname ## _propagate, \ + .copy = rbname ## _copy, \ + .rotate = rbname ## _rotate \ }; diff --git a/lib/rbtree.c b/lib/rbtree.c index 1f8b112a7c35..4ba2828a67c0 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -427,7 +427,9 @@ static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} static const struct rb_augment_callbacks dummy_callbacks = { - dummy_propagate, dummy_copy, dummy_rotate + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate }; void rb_insert_color(struct rb_node *node, struct rb_root *root) -- cgit v1.2.3 From a290fb8a9144ccb3bd34be6ffd30c245dea7fabe Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Wed, 8 Feb 2017 10:27:57 +1100 Subject: lib: update LZ4 compressor module Patch series "Update LZ4 compressor module", v7. This patchset updates the LZ4 compression module to a version based on LZ4 v1.7.3 allowing to use the fast compression algorithm aka LZ4 fast which provides an "acceleration" parameter as a tradeoff between high compression ratio and high compression speed. We want to use LZ4 fast in order to support compression in lustre and (mostly, based on that) investigate data reduction techniques in behalf of storage systems. Also, it will be useful for other users of LZ4 compression, as with LZ4 fast it is possible to enable applications to use fast and/or high compression depending on the usecase. For instance, ZRAM is offering a LZ4 backend and could benefit from an updated LZ4 in the kernel. 
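As a quick orientation for the new interface described below, a round trip with the updated API might look like this (an illustrative sketch only, not part of the patchset; the buffer handling and the acceleration value of 4 are arbitrary):

    #include <linux/errno.h>
    #include <linux/lz4.h>
    #include <linux/slab.h>

    static int lz4_roundtrip_example(const char *src, int src_len)
    {
            int bound = LZ4_COMPRESSBOUND(src_len);
            void *wrkmem = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
            char *comp = kmalloc(bound, GFP_KERNEL);
            char *decomp = kmalloc(src_len, GFP_KERNEL);
            int comp_len, decomp_len, ret = -ENOMEM;

            if (!wrkmem || !comp || !decomp)
                    goto out;

            /* acceleration 1 behaves like LZ4_compress_default(); larger
             * values trade compression ratio for speed */
            ret = -EINVAL;
            comp_len = LZ4_compress_fast(src, comp, src_len, bound, 4, wrkmem);
            if (comp_len == 0)
                    goto out;       /* output did not fit, contents invalid */

            /* the "safe" decoder never reads or writes outside the buffers */
            decomp_len = LZ4_decompress_safe(comp, decomp, comp_len, src_len);
            if (decomp_len < 0)
                    goto out;       /* malformed compressed stream */

            ret = 0;
    out:
            kfree(wrkmem);
            kfree(comp);
            kfree(decomp);
            return ret;
    }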
LZ4 homepage: http://www.lz4.org/
LZ4 source repository: https://github.com/lz4/lz4
Source version: 1.7.3

Benchmark (taken from [1], Core i5-4300U @1.9GHz):
----------------|--------------|----------------|----------
Compressor      | Compression  | Decompression  | Ratio
----------------|--------------|----------------|----------
memcpy          | 4200 MB/s    | 4200 MB/s      | 1.000
LZ4 fast 50     | 1080 MB/s    | 2650 MB/s      | 1.375
LZ4 fast 17     |  680 MB/s    | 2220 MB/s      | 1.607
LZ4 fast 5      |  475 MB/s    | 1920 MB/s      | 1.886
LZ4 default     |  385 MB/s    | 1850 MB/s      | 2.101

[1] http://fastcompression.blogspot.de/2015/04/sampling-or-faster-lz4.html

[PATCH 1/5] lib: Update LZ4 compressor module
[PATCH 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version
[PATCH 3/5] crypto: Change LZ4 modules to work with new LZ4 module version
[PATCH 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version
[PATCH 5/5] lib/lz4: Remove back-compat wrappers

This patch (of 5):

Update the LZ4 kernel module to LZ4 v1.7.3 by Yann Collet. The kernel module is inspired by the previous work by Chanho Min. The updated LZ4 module will not break existing code since the patchset contains appropriate changes. API changes: The new method LZ4_compress_fast differs from the variant previously available in the kernel by its additional acceleration parameter, which allows compression ratio to be traded for more compression speed and vice versa. LZ4_decompress_fast is the respective decompression method, featuring a very fast decoder (multiple GB/s per core), able to reach RAM speed on multi-core systems. The decompressor can decompress data compressed with LZ4 fast as well as with the LZ4 HC (high compression) algorithm. The useful functions LZ4_decompress_safe_partial and LZ4_compress_destSize were also added: the latter reverses the logic by trying to compress as much data as possible from source to dest, while the former aims to decompress partial blocks of data. A number of streaming functions were also added which allow compressing/decompressing data in multiple steps (so-called "streaming mode"). The methods lz4_compress and lz4_decompress_unknownoutputsize are now known as LZ4_compress_default and LZ4_decompress_safe, respectively. The old methods will be removed since there are no callers left in the code. Link: http://lkml.kernel.org/r/1486321748-19085-2-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S.
Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/lz4.h | 762 ++++++++++++++++++++++++++++--- lib/lz4/lz4_compress.c | 1114 ++++++++++++++++++++++++++++++++-------------- lib/lz4/lz4_decompress.c | 696 ++++++++++++++++++----------- lib/lz4/lz4defs.h | 334 ++++++++------ lib/lz4/lz4hc_compress.c | 867 +++++++++++++++++++++++------------- 5 files changed, 2690 insertions(+), 1083 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 6b784c59f321..1b7ab2a43aed 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -1,87 +1,717 @@ -#ifndef __LZ4_H__ -#define __LZ4_H__ -/* - * LZ4 Kernel Interface +/* LZ4 Kernel Interface * * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * Copyright (C) 2016, Sven Schmidt <4sschmid@informatik.uni-hamburg.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * + * This file is based on the original header file + * for LZ4 - Fast LZ compression algorithm. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 */ -#define LZ4_MEM_COMPRESS (16384) -#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) +#ifndef __LZ4_H__ +#define __LZ4_H__ + +#include +#include /* memset, memcpy */ + +/*-************************************************************************ + * CONSTANTS + **************************************************************************/ /* - * lz4_compressbound() - * Provides the maximum size that LZ4 may output in a "worst case" scenario - * (input data not compressible) + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes + * (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
+ * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ -static inline size_t lz4_compressbound(size_t isize) +#define LZ4_MEMORY_USAGE 14 + +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) (\ + (unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize)/255) + 16) + +#define LZ4_ACCELERATION_DEFAULT 1 +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) + +#define LZ4HC_MIN_CLEVEL 3 +#define LZ4HC_DEFAULT_CLEVEL 9 +#define LZ4HC_MAX_CLEVEL 16 + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<= LZ4_compressBound(inputSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'source' into a more limited 'dest' budget, + * compression stops *immediately*, and the function result is zero. + * As a consequence, 'dest' content is not valid. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_default(const char *source, char *dest, int inputSize, + int maxOutputSize, void *wrkmem); + +/** + * LZ4_compress_fast() - As LZ4_compress_default providing an acceleration param + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @inputSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxOutputSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Same as LZ4_compress_default(), but allows to select an "acceleration" + * factor. The larger the acceleration value, the faster the algorithm, + * but also the lesser the compression. It's a trade-off. It can be fine tuned, + * with each successive value providing roughly +~3% to speed. + * An acceleration value of "1" is the same as regular LZ4_compress_default() + * Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT, which is 1. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_fast(const char *source, char *dest, int inputSize, + int maxOutputSize, int acceleration, void *wrkmem); + +/** + * LZ4_compress_destSize() - Compress as much data as possible + * from source to dest + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @sourceSizePtr: will be modified to indicate how many bytes where read + * from 'source' to fill 'dest'. New value is necessarily <= old value. + * @targetDestSize: Size of buffer 'dest' which must be already allocated + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Reverse the logic, by compressing as much data as possible + * from 'source' buffer into already allocated buffer 'dest' + * of size 'targetDestSize'. + * This function either compresses the entire 'source' content into 'dest' + * if it's large enough, or fill 'dest' buffer completely with as much data as + * possible from 'source'. 
+ * + * Return: Number of bytes written into 'dest' (necessarily <= targetDestSize) + * or 0 if compression fails + */ +int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, + int targetDestSize, void *wrkmem); /* - * lz4_decompress() - * src : source address of the compressed data - * src_len : is the input size, whcih is returned after decompress done - * dest : output buffer address of the decompressed data - * actual_dest_len: is the size of uncompressed data, supposing it's known - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. - * slightly faster than lz4_decompress_unknownoutputsize() + * lz4_compress() - For backward compatibility, see LZ4_compress_default + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND + * @dst_len: is the output size, which is returned after compress done + * @workmem: address of the working memory. + * + * Return: Success if return 0, Error if return < 0 */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); +int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/*-************************************************************************ + * Decompression Functions + **************************************************************************/ + +/** + * LZ4_decompress_fast() - Decompresses data from 'source' into 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * Decompresses data from 'source' into 'dest'. + * This function fully respect memory boundaries for properly formed + * compressed data. + * It is a bit faster than LZ4_decompress_safe(). + * However, it does not provide any protection against intentionally + * modified data stream (malicious input). + * Use this function in trusted environment only + * (data to decode comes from a trusted source). + * + * Return: number of bytes read from the source buffer + * or a negative result if decompression fails. + */ +int LZ4_decompress_fast(const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe() - Decompression protected against buffer overflow + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * Decompresses data fom 'source' into 'dest'. + * If the source stream is detected malformed, the function will + * stop decoding and return a negative result. + * This function is protected against buffer overflow exploits, + * including malicious data packets. It never writes outside output buffer, + * nor reads outside input buffer. 
+ * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_safe_partial() - Decompress a block of size 'compressedSize' + * at position 'source' into buffer 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the decompressed data which must be + * already allocated + * @compressedSize: is the precise full size of the compressed block. + * @targetOutputSize: the decompression operation will try + * to stop as soon as 'targetOutputSize' has been reached + * @maxDecompressedSize: is the size of destination buffer + * + * This function decompresses a compressed block of size 'compressedSize' + * at position 'source' into destination buffer 'dest' + * of size 'maxDecompressedSize'. + * The function tries to stop decompressing operation as soon as + * 'targetOutputSize' has been reached, reducing decompression time. + * This function never writes outside of output buffer, + * and never reads outside of input buffer. + * It is therefore protected against malicious data packets. + * + * Return: the number of bytes decoded in the destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + * + */ +int LZ4_decompress_safe_partial(const char *source, char *dest, + int compressedSize, int targetOutputSize, int maxDecompressedSize); /* - * lz4_decompress_unknownoutputsize() - * src : source address of the compressed data - * src_len : is the input size, therefore the compressed size - * dest : output buffer address of the decompressed data - * dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after - * decompress done - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. 
+ * lz4_decompress_unknownoutputsize() - For backwards compatibility, + * see LZ4_decompress_safe + * @src: source address of the compressed data + * @src_len: is the input size, therefore the compressed size + * @dest: output buffer address of the decompressed data + * which must be already allocated + * @dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after decompress done + * + * Return: Success if return 0, Error if return (< 0) */ int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); + unsigned char *dest, size_t *dest_len); + +/** + * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast + * @src: source address of the compressed data + * @src_len: is the input size, which is returned after decompress done + * @dest: output buffer address of the decompressed data, + * which must be already allocated + * @actual_dest_len: is the size of uncompressed data, supposing it's known + * + * Return: Success if return 0, Error if return (< 0) + */ +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); + +/*-************************************************************************ + * LZ4 HC Compression + **************************************************************************/ + +/** + * LZ4_compress_HC() - Compress data from `src` into `dst`, using HC algorithm + * @src: source address of the original data + * @dst: output buffer address of the compressed data + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @dstCapacity: full or partial size of buffer 'dst', + * which must be already allocated + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * @wrkmem: address of the working memory. + * This requires 'wrkmem' of size LZ4HC_MEM_COMPRESS. + * + * Compress data from 'src' into 'dst', using the more powerful + * but slower "HC" algorithm. Compression is guaranteed to succeed if + * `dstCapacity >= LZ4_compressBound(srcSize) + * + * Return : the number of bytes written into 'dst' or 0 if compression fails. + */ +int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, + int compressionLevel, void *wrkmem); + +/** + * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND. + * @dst_len: is the output size, which is returned after compress done + * @wrkmem: address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * + * Return : Success if return 0, Error if return (< 0) + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/** + * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure + * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * + * An LZ4_streamHC_t structure can be allocated once + * and re-used multiple times. 
+ * Use this function to init an allocated `LZ4_streamHC_t` structure + * and start a new compression. + */ +void LZ4_resetStreamHC(LZ4_streamHC_t *streamHCPtr, int compressionLevel); + +/** + * LZ4_loadDictHC() - Load a static dictionary into LZ4_streamHC + * @streamHCPtr: pointer to the LZ4HC_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4HC_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDictHC(LZ4_streamHC_t *streamHCPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_compress_HC_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary using the HC algorithm + * @streamHCPtr: Pointer to the previous 'LZ4_streamHC_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * + * These functions compress data in successive blocks of any size, using + * previous blocks as dictionary. One key assumption is that previous + * blocks (up to 64 KB) remain read-accessible while + * compressing next blocks. There is an exception for ring buffers, + * which can be smaller than 64 KB. + * Ring buffers scenario is automatically detected and handled by + * LZ4_compress_HC_continue(). + * Before starting compression, state must be properly initialized, + * using LZ4_resetStreamHC(). + * A first "fictional block" can then be designated as + * initial dictionary, using LZ4_loadDictHC() (Optional). + * Then, use LZ4_compress_HC_continue() + * to compress each successive block. Previous memory blocks + * (including initial dictionary when present) must remain accessible + * and unmodified during compression. + * 'dst' buffer should be sized to handle worst case scenarios, using + * LZ4_compressBound(), to ensure operation success. + * If, for any reason, previous data blocks can't be preserved unmodified + * in memory during next compression block, + * you must save it to a safer memory space, using LZ4_saveDictHC(). + * Return value of LZ4_saveDictHC() is the size of dictionary + * effectively saved into 'safeBuffer'. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_HC_continue(LZ4_streamHC_t *streamHCPtr, const char *src, + char *dst, int srcSize, int maxDstSize); + +/** + * LZ4_saveDictHC() - Save static dictionary from LZ4HC_stream + * @streamHCPtr: pointer to the 'LZ4HC_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @maxDictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDictHC() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_HC_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= maxDictSize), + * or 0 if error. 
+ */ +int LZ4_saveDictHC(LZ4_streamHC_t *streamHCPtr, char *safeBuffer, + int maxDictSize); + +/*-********************************************* + * Streaming Compression Functions + ***********************************************/ + +/** + * LZ4_resetStream() - Init an allocated 'LZ4_stream_t' structure + * @LZ4_stream: pointer to the 'LZ4_stream_t' structure + * + * An LZ4_stream_t structure can be allocated once + * and re-used multiple times. + * Use this function to init an allocated `LZ4_stream_t` structure + * and start a new compression. + */ +void LZ4_resetStream(LZ4_stream_t *LZ4_stream); + +/** + * LZ4_loadDict() - Load a static dictionary into LZ4_stream + * @streamPtr: pointer to the LZ4_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict(LZ4_stream_t *streamPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_saveDict() - Save static dictionary from LZ4_stream + * @streamPtr: pointer to the 'LZ4_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @dictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_fast_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= dictSize), + * or 0 if error. + */ +int LZ4_saveDict(LZ4_stream_t *streamPtr, char *safeBuffer, int dictSize); + +/** + * LZ4_compress_fast_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary + * @streamPtr: Pointer to the previous 'LZ4_stream_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * + * Compress buffer content 'src', using data from previously compressed blocks + * as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still + * be present and unmodified ! + * If maxDstSize >= LZ4_compressBound(srcSize), + * compression is guaranteed to succeed, and runs faster. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_fast_continue(LZ4_stream_t *streamPtr, const char *src, + char *dst, int srcSize, int maxDstSize, int acceleration); + +/** + * LZ4_setStreamDecode() - Instruct where to find dictionary + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @dictionary: dictionary to use + * @dictSize: size of dictionary + * + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). 
+ * + * Return: 1 if OK, 0 if error + */ +int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, + const char *dictionary, int dictSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. 
+ * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_safe_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxDecompressedSize, const char *dictStart, + int dictSize); + +/** + * LZ4_decompress_fast_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_fast_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_usingDict(const char *source, char *dest, + int originalSize, const char *dictStart, int dictSize); + #endif diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index 28321d8f75ef..6aa7ac381e1a 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -1,19 +1,16 @@ /* * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -25,417 +22,886 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * Changed for kernel use by: - * Chanho Min + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ +/*-************************************ + * Dependencies + **************************************/ +#include +#include "lz4defs.h" #include #include -#include #include -#include "lz4defs.h" -/* - * LZ4_compressCtx : - * ----------------- - * Compress 'isize' bytes from 'source' into an output buffer 'dest' of - * maximum size 'maxOutputSize'. * If it cannot achieve it, compression - * will stop, and result of the function will be zero. - * return : the number of bytes written in buffer 'dest', or 0 if the - * compression fails - */ -static inline int lz4_compressctx(void *ctx, - const char *source, - char *dest, - int isize, - int maxoutputsize) +/*-****************************** + * Compression functions + ********************************/ +static U32 LZ4_hash4(U32 sequence, tableType_t const tableType) { - HTYPE *hashtable = (HTYPE *)ctx; - const u8 *ip = (u8 *)source; + if (tableType == byU16) + return ((sequence * 2654435761U) + >> ((MINMATCH*8) - (LZ4_HASHLOG + 1))); + else + return ((sequence * 2654435761U) + >> ((MINMATCH*8) - LZ4_HASHLOG)); +} + #if LZ4_ARCH64 - const BYTE * const base = ip; +static U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) + ? 
LZ4_HASHLOG + 1 + : LZ4_HASHLOG; + +#ifdef __LITTLE_ENDIAN__ + static const U64 prime5bytes = 889523592379ULL; + + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); #else - const int base = 0; + static const U64 prime8bytes = 11400714785074694791ULL; + + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); #endif - const u8 *anchor = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - #define MATCHLIMIT (iend - LASTLITERALS) - - u8 *op = (u8 *) dest; - u8 *const oend = op + maxoutputsize; - int length; - const int skipstrength = SKIPSTRENGTH; - u32 forwardh; - int lastrun; - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; +} +#endif + +static U32 LZ4_hashPosition(const void *p, tableType_t tableType) +{ +#if LZ4_ARCH64 + if (tableType == byU32) + return LZ4_hash5(LZ4_read_ARCH(p), tableType); +#endif + + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putPositionOnHash(const BYTE *p, U32 h, void *tableBase, + tableType_t const tableType, const BYTE *srcBase) +{ + switch (tableType) { + case byPtr: + { + const BYTE **hashTable = (const BYTE **)tableBase; + + hashTable[h] = p; + return; + } + case byU32: + { + U32 *hashTable = (U32 *) tableBase; + + hashTable[h] = (U32)(p - srcBase); + return; + } + case byU16: + { + U16 *hashTable = (U16 *) tableBase; + + hashTable[h] = (U16)(p - srcBase); + return; + } + } +} + +static inline void LZ4_putPosition(const BYTE *p, void *tableBase, + tableType_t tableType, const BYTE *srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE *LZ4_getPositionOnHash(U32 h, void *tableBase, + tableType_t tableType, const BYTE *srcBase) +{ + if (tableType == byPtr) { + const BYTE **hashTable = (const BYTE **) tableBase; + + return hashTable[h]; + } + + if (tableType == byU32) { + const U32 * const hashTable = (U32 *) tableBase; + + return hashTable[h] + srcBase; + } + + { + /* default, to ensure a return */ + const U16 * const hashTable = (U16 *) tableBase; + return hashTable[h] + srcBase; + } +} + +static inline const BYTE *LZ4_getPosition(const BYTE *p, void *tableBase, + tableType_t tableType, const BYTE *srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +/* + * LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time + */ +static inline int LZ4_compress_generic( + LZ4_stream_t_internal * const dictPtr, + const char * const source, + char * const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *base; + const BYTE *lowLimit; + const BYTE * const lowRefLimit = ip - dictPtr->dictSize; + const BYTE * const dictionary = dictPtr->dictionary; + const BYTE * const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE *)source; + const BYTE *anchor = (const BYTE *) source; + const BYTE * const iend = ip + inputSize; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = iend - LASTLITERALS; + + BYTE *op = (BYTE *) dest; + BYTE * const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta = 0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) { 
+ /* Unsupported inputSize, too large (or negative) */ + return 0; + } + switch (dict) { + case noDict: + default: + base = (const BYTE *)source; + lowLimit = (const BYTE *)source; + break; + case withPrefix64k: + base = (const BYTE *)source - dictPtr->currentOffset; + lowLimit = (const BYTE *)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE *)source - dictPtr->currentOffset; + lowLimit = (const BYTE *)source; + break; + } + + if ((tableType == byU16) + && (inputSize >= LZ4_64Klimit)) { + /* Size too large (not within 64K limit) */ + return 0; + } - memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + if (inputSize < LZ4_minLength) { + /* Input too small, no compression (all literals) */ + goto _last_literals; + } /* First Byte */ - hashtable[LZ4_HASH_VALUE(ip)] = ip - base; - ip++; - forwardh = LZ4_HASH_VALUE(ip); + LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for (;;) { - int findmatchattempts = (1U << skipstrength) + 3; - const u8 *forwardip = ip; - const u8 *ref; - u8 *token; + for ( ; ; ) { + const BYTE *match; + BYTE *token; /* Find a match */ - do { - u32 h = forwardh; - int step = findmatchattempts++ >> skipstrength; - ip = forwardip; - forwardip = ip + step; - - if (unlikely(forwardip > mflimit)) - goto _last_literals; - - forwardh = LZ4_HASH_VALUE(forwardip); - ref = base + hashtable[h]; - hashtable[h] = ip - base; - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + { + const BYTE *forwardIp = ip; + unsigned int step = 1; + unsigned int searchMatchNb = acceleration + << LZ4_skipTrigger; + + do { + U32 const h = forwardH; + + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, + dictPtr->hashTable, + tableType, base); + if (dict == usingExtDict) { + if (match < (const BYTE *)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE *)source; + } } + forwardH = LZ4_hashPosition(forwardIp, + tableType); + LZ4_putPositionOnHash(ip, h, dictPtr->hashTable, + tableType, base); + + } while (((dictIssue == dictSmall) + ? (match < lowRefLimit) + : 0) + || ((tableType == byU16) + ? 
0 + : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match + refDelta) + != LZ4_read32(ip))); + } /* Catch up */ - while ((ip > anchor) && (ref > (u8 *)source) && - unlikely(ip[-1] == ref[-1])) { + while (((ip > anchor) & (match + refDelta > lowLimit)) + && (unlikely(ip[-1] == match[refDelta - 1]))) { ip--; - ref--; - } + match--; + } - /* Encode Literal length */ - length = (int)(ip - anchor); - token = op++; - /* check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return 0; + /* Encode Literals */ + { + unsigned const int litLength = (unsigned int)(ip - anchor); - if (length >= (int)RUN_MASK) { - int len; - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254 ; len -= 255) - *op++ = 255; - *op++ = (u8)len; - } else - *token = (length << ML_BITS); + token = op++; + if ((outputLimited) && + /* Check output buffer overflow */ + (unlikely(op + litLength + + (2 + 1 + LASTLITERALS) + + (litLength/255) > olimit))) + return 0; + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + + *token = (RUN_MASK<= 255 ; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength<> 8) > oend)) - return 0; - if (length >= (int)ML_MASK) { - *token += ML_MASK; - length -= ML_MASK; - for (; length > 509 ; length -= 510) { - *op++ = 255; - *op++ = 255; - } - if (length > 254) { - length -= 255; - *op++ = 255; + { + unsigned int matchCode; + + if ((dict == usingExtDict) + && (lowLimit == dictionary)) { + const BYTE *limit; + + match += refDelta; + limit = ip + (dictEnd - match); + if (limit > matchlimit) + limit = matchlimit; + matchCode = LZ4_count(ip + MINMATCH, + match + MINMATCH, limit); + ip += MINMATCH + matchCode; + if (ip == limit) { + unsigned const int more = LZ4_count(ip, + (const BYTE *)source, + matchlimit); + + matchCode += more; + ip += more; + } + } else { + matchCode = LZ4_count(ip + MINMATCH, + match + MINMATCH, matchlimit); + ip += MINMATCH + matchCode; } - *op++ = (u8)length; - } else - *token += length; + + if (outputLimited && + /* Check output buffer overflow */ + (unlikely(op + + (1 + LASTLITERALS) + + (matchCode>>8) > olimit))) + return 0; + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op += 4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + + anchor = ip; /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; + if (ip > mflimit) break; - } /* Fill table */ - hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + LZ4_putPosition(ip - 2, dictPtr->hashTable, tableType, base); /* Test next position */ - ref = base + hashtable[LZ4_HASH_VALUE(ip)]; - hashtable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + match = LZ4_getPosition(ip, dictPtr->hashTable, + tableType, base); + if (dict == usingExtDict) { + if (match < (const BYTE *)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE *)source; + } + } + LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); + if (((dictIssue == dictSmall) ? 
(match >= lowRefLimit) : 1) + && (match + MAX_DISTANCE >= ip) + && (LZ4_read32(match + refDelta) == LZ4_read32(ip))) { token = op++; *token = 0; goto _next_match; } /* Prepare next loop */ - anchor = ip++; - forwardh = LZ4_HASH_VALUE(ip); + forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (((char *)op - dest) + lastrun + 1 - + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) - return 0; + { + size_t const lastRun = (size_t)(iend - anchor); - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8)lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; + if ((outputLimited) && + /* Check output buffer overflow */ + ((op - (BYTE *)dest) + lastRun + 1 + + ((lastRun + 255 - RUN_MASK)/255) > (U32)maxOutputSize)) + return 0; + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255 ; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun<internal_donotuse; + + LZ4_resetStream((LZ4_stream_t *)state); + + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, + dest, inputSize, 0, + noLimit, byU16, noDict, + noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, + dest, inputSize, 0, + noLimit, tableType, noDict, + noDictIssue, acceleration); + } else { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, + dest, inputSize, + maxOutputSize, limitedOutput, byU16, noDict, + noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, + dest, inputSize, + maxOutputSize, limitedOutput, tableType, noDict, + noDictIssue, acceleration); + } +} - memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); +int LZ4_compress_fast(const char *source, char *dest, int inputSize, + int maxOutputSize, int acceleration, void *wrkmem) +{ + return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize, + maxOutputSize, acceleration); +} +EXPORT_SYMBOL(LZ4_compress_fast); + +int LZ4_compress_default(const char *source, char *dest, int inputSize, + int maxOutputSize, void *wrkmem) +{ + return LZ4_compress_fast(source, dest, inputSize, + maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem); +} +EXPORT_SYMBOL(LZ4_compress_default); + +/*-****************************** + * *_destSize() variant + ********************************/ + +static int LZ4_compress_destSize_generic( + LZ4_stream_t_internal * const ctx, + const char * const src, + char * const dst, + int * const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE *ip = (const BYTE *) src; + const BYTE *base = (const BYTE *) src; + const BYTE *lowLimit = (const BYTE *) src; + const BYTE *anchor = ip; + const BYTE * const iend = ip + *srcSizePtr; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = iend - LASTLITERALS; + + BYTE *op = (BYTE *) dst; + BYTE * const oend = op + targetDstSize; + BYTE * const oMaxLit = op + targetDstSize - 2 /* offset */ + - 8 /* because 8 + MINMATCH == MFLIMIT */ - 1 /* token */; + BYTE * const oMaxMatch = op + targetDstSize + - (LASTLITERALS + 1 /* token */); + BYTE * const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + 
/* Init conditions */ + /* Impossible to store anything */ + if (targetDstSize < 1) + return 0; + /* Unsupported input size, too large (or negative) */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) + return 0; + /* Size too large (not within 64K limit) */ + if ((tableType == byU16) && (*srcSizePtr >= LZ4_64Klimit)) + return 0; + /* Input too small, no compression (all literals) */ + if (*srcSizePtr < LZ4_minLength) + goto _last_literals; /* First Byte */ - ip++; - forwardh = LZ4_HASH64K_VALUE(ip); + *srcSizePtr = 0; + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for (;;) { - int findmatchattempts = (1U << skipstrength) + 3; - const u8 *forwardip = ip; - const u8 *ref; - u8 *token; + for ( ; ; ) { + const BYTE *match; + BYTE *token; /* Find a match */ - do { - u32 h = forwardh; - int step = findmatchattempts++ >> skipstrength; - ip = forwardip; - forwardip = ip + step; - - if (forwardip > mflimit) - goto _last_literals; - - forwardh = LZ4_HASH64K_VALUE(forwardip); - ref = base + hashtable[h]; - hashtable[h] = (u16)(ip - base); - } while (A32(ref) != A32(ip)); + { + const BYTE *forwardIp = ip; + unsigned int step = 1; + unsigned int searchMatchNb = 1 << LZ4_skipTrigger; + + do { + U32 h = forwardH; + + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx->hashTable, + tableType, base); + forwardH = LZ4_hashPosition(forwardIp, + tableType); + LZ4_putPositionOnHash(ip, h, + ctx->hashTable, tableType, + base); + + } while (((tableType == byU16) + ? 0 + : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip))); + } /* Catch up */ - while ((ip > anchor) && (ref > (u8 *)source) - && (ip[-1] == ref[-1])) { - ip--; - ref--; - } + while ((ip > anchor) + && (match > lowLimit) + && (unlikely(ip[-1] == match[-1]))) { + ip--; match--; + } /* Encode Literal length */ - length = (int)(ip - anchor); - token = op++; - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) - + (length >> 8) > oend)) - return 0; - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254 ; len -= 255) - *op++ = 255; - *op++ = (u8)len; - } else - *token = (length << ML_BITS); + { + unsigned int litLength = (unsigned int)(ip - anchor); - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); + token = op++; + if (op + ((litLength + 240)/255) + + litLength > oMaxLit) { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned int len = litLength - RUN_MASK; + *token = (RUN_MASK<= 255 ; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength< oMaxMatch) { + /* Match description too long : reduce it */ + matchLength = (15 - 1) + (oMaxMatch - op) * 255; } - ip += LZ4_NBCOMMONBYTES(diff); - goto _endcount; + ip += MINMATCH + matchLength; + + if (matchLength >= ML_MASK) { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { + matchLength -= 255; *op++ = 255; + } + *op++ = (BYTE)matchLength; + } else + *token += (BYTE)(matchLength); } - #if LZ4_ARCH64 - if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } - #endif - if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < MATCHLIMIT) && (*ref == *ip)) - ip++; -_endcount: - /* Encode MatchLength */ - 
len = (int)(ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return 0; - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509 ; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (u8)len; - } else - *token += len; + anchor = ip; - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; + /* Test end of block */ + if (ip > mflimit) + break; + if (op > oMaxSeq) break; - } /* Fill table */ - hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + LZ4_putPosition(ip - 2, ctx->hashTable, tableType, base); /* Test next position */ - ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; - hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); - if (A32(ref) == A32(ip)) { - token = op++; - *token = 0; + match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + + if ((match + MAX_DISTANCE >= ip) + && (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; *token = 0; goto _next_match; } /* Prepare next loop */ - anchor = ip++; - forwardh = LZ4_HASH64K_VALUE(ip); + forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) - return 0; - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8)lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; + { + size_t lastRunSize = (size_t)(iend - anchor); + + if (op + 1 /* token */ + + ((lastRunSize + 240)/255) /* litLength */ + + lastRunSize /* literals */ > oend) { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend - op) - 1; + lastRunSize -= (lastRunSize + 240)/255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255 ; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) { + /* compression success is guaranteed */ + return LZ4_compress_fast_extState( + state, src, dst, *srcSizePtr, + targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic( + &state->internal_donotuse, + src, dst, srcSizePtr, + targetDstSize, byU16); + else + return LZ4_compress_destSize_generic( + &state->internal_donotuse, + src, dst, srcSizePtr, + targetDstSize, tableType); + } +} - if (src_len < LZ4_64KLIMIT) - out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, - lz4_compressbound(src_len)); - else - out_len = lz4_compressctx(wrkmem, src, dst, src_len, - lz4_compressbound(src_len)); - if (out_len < 0) - goto exit; +int LZ4_compress_destSize(const char *src, char *dst, int *srcSizePtr, + int targetDstSize, void *wrkmem) +{ + return LZ4_compress_destSize_extState(wrkmem, src, dst, srcSizePtr, + targetDstSize); +} +EXPORT_SYMBOL(LZ4_compress_destSize); + +/*-****************************** + * Streaming functions + ********************************/ +void LZ4_resetStream(LZ4_stream_t *LZ4_stream) +{ + memset(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +#define HASH_UNIT sizeof(size_t) +int LZ4_loadDict(LZ4_stream_t *LZ4_dict, + const char *dictionary, int dictSize) +{ + LZ4_stream_t_internal *dict = &LZ4_dict->internal_donotuse; + 
const BYTE *p = (const BYTE *)dictionary; + const BYTE * const dictEnd = p + dictSize; + const BYTE *base; + + if ((dict->initCheck) + || (dict->currentOffset > 1 * GB)) { + /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + } + + if (dictSize < (int)HASH_UNIT) { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 * KB) + p = dictEnd - 64 * KB; + dict->currentOffset += 64 * KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd - HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p += 3; + } + + return dict->dictSize; +} +EXPORT_SYMBOL(LZ4_loadDict); + +static void LZ4_renormDictT(LZ4_stream_t_internal *LZ4_dict, + const BYTE *src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { + /* address space overflow */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 * KB; + const BYTE *dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + + for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) + LZ4_dict->hashTable[i] = 0; + else + LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 * KB; + if (LZ4_dict->dictSize > 64 * KB) + LZ4_dict->dictSize = 64 * KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + +int LZ4_saveDict(LZ4_stream_t *LZ4_dict, char *safeBuffer, int dictSize) +{ + LZ4_stream_t_internal * const dict = &LZ4_dict->internal_donotuse; + const BYTE * const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 * KB) { + /* useless to define a dictionary > 64 * KB */ + dictSize = 64 * KB; + } + if ((U32)dictSize > dict->dictSize) + dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE *)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} +EXPORT_SYMBOL(LZ4_saveDict); + +int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source, + char *dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal *streamPtr = &LZ4_stream->internal_donotuse; + const BYTE * const dictEnd = streamPtr->dictionary + + streamPtr->dictSize; + + const BYTE *smallest = (const BYTE *) source; + + if (streamPtr->initCheck) { + /* Uninitialized structure detected */ + return 0; + } + + if ((streamPtr->dictSize > 0) && (smallest > dictEnd)) + smallest = dictEnd; + + LZ4_renormDictT(streamPtr, smallest); - *dst_len = out_len; + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; - return 0; -exit: - return ret; + /* Check overlapping input/dictionary space */ + { + const BYTE *sourceEnd = (const BYTE *) source + inputSize; + + if ((sourceEnd > streamPtr->dictionary) + && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 * KB) + streamPtr->dictSize = 64 * KB; + if (streamPtr->dictSize < 4) + streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE *)source) { + int result; + + if ((streamPtr->dictSize < 64 * KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + withPrefix64k, dictSmall, acceleration); + } else { + result = 
LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + withPrefix64k, noDictIssue, acceleration); + } + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } + + /* external dictionary mode */ + { + int result; + + if ((streamPtr->dictSize < 64 * KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + usingExtDict, noDictIssue, acceleration); + } + streamPtr->dictionary = (const BYTE *)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} +EXPORT_SYMBOL(LZ4_compress_fast_continue); + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem) { + *dst_len = LZ4_compress_default(src, dst, src_len, + *dst_len, wrkmem); + + /* + * Prior lz4_compress will return -1 in case of error + * and 0 on success + * while new LZ4_compress_fast/default + * returns 0 in case of error + * and the output length on success + */ + if (!*dst_len) + return -1; + else + return 0; } EXPORT_SYMBOL(lz4_compress); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 6d940c72b5fc..fd665ca2ab0c 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -1,25 +1,16 @@ /* - * LZ4 Decompressor for Linux kernel - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee - * - * Based on LZ4 implementation by Yann Collet. - * * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -31,313 +22,508 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
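A sketch of the two compression calling conventions the lz4_compress() shim above bridges, with hypothetical names, a caller-provided workspace (assumed to be LZ4_MEM_COMPRESS bytes, as with the old API), and both prototypes assumed to stay declared in <linux/lz4.h>:

#include <linux/lz4.h>

/* Old convention: returns 0 on success and -1 on error; *dst_len is passed
 * in as the capacity of 'dst' and comes back as the compressed size. */
static int example_compress_old_api(const unsigned char *src, size_t src_len,
				    unsigned char *dst, size_t *dst_len,
				    void *wrkmem)
{
	return lz4_compress(src, src_len, dst, dst_len, wrkmem);
}

/* New convention: returns the compressed size directly, or 0 on error. */
static int example_compress_new_api(const char *src, int src_len,
				    char *dst, int dst_cap, void *wrkmem)
{
	return LZ4_compress_default(src, dst, src_len, dst_cap, wrkmem);
}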
+ * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ -#ifndef STATIC +/*-************************************ + * Dependencies + **************************************/ +#include +#include "lz4defs.h" +#include #include #include -#endif -#include - #include -#include "lz4defs.h" - -static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 -static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif - -static int lz4_uncompress(const char *source, char *dest, int osize) +/*-***************************** + * Decompression functions + *******************************/ +/* LZ4_decompress_generic() : + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is important this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +static inline int LZ4_decompress_generic( + const char *const source, + char * const dest, + int inputSize, + /* + * If endOnInput == endOnInputSize, + * this value is the max size of Output Buffer. + */ + int outputSize, + /* endOnOutputSize, endOnInputSize */ + int endOnInput, + /* full, partial */ + int partialDecoding, + /* only used if partialDecoding == partial */ + int targetOutputSize, + /* noDict, withPrefix64k, usingExtDict */ + int dict, + /* == dest when no prefix */ + const BYTE * const lowPrefix, + /* only if dict == usingExtDict */ + const BYTE * const dictStart, + /* note : = 0 if noDict */ + const size_t dictSize + ) { + /* Local Variables */ const BYTE *ip = (const BYTE *) source; - const BYTE *ref; + const BYTE * const iend = ip + inputSize; + BYTE *op = (BYTE *) dest; - BYTE * const oend = op + osize; + BYTE * const oend = op + outputSize; BYTE *cpy; - unsigned token; - size_t length; + BYTE *oexit = op + targetOutputSize; + const BYTE * const lowLimit = lowPrefix - dictSize; + + const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; + const unsigned int dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; + const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + const int safeDecode = (endOnInput == endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); + /* Special cases */ + /* targetOutputSize too high => decode everything */ + if ((partialDecoding) && (oexit > oend - MFLIMIT)) + oexit = oend - MFLIMIT; + + /* Empty output buffer */ + if ((endOnInput) && (unlikely(outputSize == 0))) + return ((inputSize == 1) && (*ip == 0)) ? 0 : -1; + + if ((!endOnInput) && (unlikely(outputSize == 0))) + return (*ip == 0 ? 1 : -1); + + /* Main Loop : decode sequences */ while (1) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned int const token = *ip++; + + length = token>>ML_BITS; - /* get runlength */ - token = *ip++; - length = (token >> ML_BITS); if (length == RUN_MASK) { - size_t len; + unsigned int s; - len = *ip++; - for (; len == 255; length += 255) - len = *ip++; - if (unlikely(length > (size_t)(length + len))) + do { + s = *ip++; + length += s; + } while (likely(endOnInput + ? 
ip < iend - RUN_MASK + : 1) & (s == 255)); + + if ((safeDecode) + && unlikely( + (size_t)(op + length) < (size_t)(op))) { + /* overflow detection */ goto _output_error; - length += len; + } + if ((safeDecode) + && unlikely( + (size_t)(ip + length) < (size_t)(ip))) { + /* overflow detection */ + goto _output_error; + } } /* copy literals */ cpy = op + length; - if (unlikely(cpy > oend - COPYLENGTH)) { - /* - * Error: not enough place for another match - * (min 4) + 5 literals - */ - if (cpy != oend) - goto _output_error; - + if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT)) + || (ip + length > iend - (2 + 1 + LASTLITERALS)))) + || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) { + if (partialDecoding) { + if (cpy > oend) { + /* + * Error : + * write attempt beyond end of output buffer + */ + goto _output_error; + } + if ((endOnInput) + && (ip + length > iend)) { + /* + * Error : + * read attempt beyond + * end of input buffer + */ + goto _output_error; + } + } else { + if ((!endOnInput) + && (cpy != oend)) { + /* + * Error : + * block decoding must + * stop exactly there + */ + goto _output_error; + } + if ((endOnInput) + && ((ip + length != iend) + || (cpy > oend))) { + /* + * Error : + * input must be consumed + */ + goto _output_error; + } + } memcpy(op, ip, length); ip += length; - break; /* EOF */ + op += length; + /* Necessarily EOF, due to parsing restrictions */ + break; } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); - op = cpy; + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); - ip += 2; - - /* Error: offset create reference outside destination buffer */ - if (unlikely(ref < (BYTE *const) dest)) + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + if ((checkOffset) && (unlikely(match < lowLimit))) { + /* Error : offset outside buffers */ goto _output_error; + } + /* costs ~1%; silence an msan warning when offset == 0 */ + LZ4_write32(op, (U32)offset); /* get matchlength */ length = token & ML_MASK; if (length == ML_MASK) { - for (; *ip == 255; length += 255) - ip++; - if (unlikely(length > (size_t)(length + *ip))) + unsigned int s; + + do { + s = *ip++; + if ((endOnInput) && (ip > iend - LASTLITERALS)) + goto _output_error; + length += s; + } while (s == 255); + if ((safeDecode) + && unlikely( + (size_t)(op + length) < (size_t)op)) { + /* overflow detection */ goto _output_error; - length += *ip++; + } } + length += MINMATCH; + + /* check external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + if (unlikely(op + length > oend - LASTLITERALS)) { + /* doesn't respect parsing restriction */ + goto _output_error; + } - /* copy repeated sequence */ - if (unlikely((op - ref) < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op-ref]; - PUT4(ref, op); - op += STEPSIZE - 4; - ref -= dec64; + if (length <= (size_t)(lowPrefix - match)) { + /* + * match can be copied as a single segment + * from external dictionary + */ + memmove(op, dictEnd - (lowPrefix - match), length); + op += length; + } else { + /* + * match encompass external + * dictionary and current block + */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + + if (restSize > (size_t)(op - lowPrefix)) { + /* overlap copy */ + 
BYTE * const endOfMatch = op + restSize; + const BYTE *copyFrom = lowPrefix; + + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + + /* copy match within block */ + cpy = op + length; + if (unlikely(offset < 8)) { + const int dec64 = dec64table[offset]; + + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op + 4, match, 4); + match -= dec64; } else { - LZ4_COPYSTEP(ref, op); + LZ4_copy8(op, match); match += 8; } - cpy = op + length - (STEPSIZE - 4); - if (cpy > (oend - COPYLENGTH)) { - /* Error: request to write beyond destination buffer */ - if (cpy > oend) - goto _output_error; -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif + op += 8; + + if (unlikely(cpy > oend - 12)) { + BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + + if (cpy > oend - LASTLITERALS) { + /* + * Error : last LASTLITERALS bytes + * must be literals (uncompressed) + */ goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + } + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } while (op < cpy) - *op++ = *ref++; - op = cpy; - /* - * Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - continue; + *op++ = *match++; + } else { + LZ4_copy8(op, match); + if (length > 16) + LZ4_wildCopy(op + 8, match + 8, cpy); } - LZ4_SECURECOPY(ref, op, cpy); op = cpy; /* correction */ } + /* end of decoding */ - return (int) (((char *)ip) - source); + if (endOnInput) { + /* Nb of output bytes decoded */ + return (int) (((char *)op) - dest); + } else { + /* Nb of input bytes read */ + return (int) (((const char *)ip) - source); + } - /* write overflow error detected */ + /* Overflow error detected */ _output_error: return -1; } -static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, - int isize, size_t maxoutputsize) +int LZ4_decompress_safe(const char *source, char *dest, + int compressedSize, int maxDecompressedSize) { - const BYTE *ip = (const BYTE *) source; - const BYTE *const iend = ip + isize; - const BYTE *ref; - + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, endOnInputSize, full, 0, + noDict, (BYTE *)dest, NULL, 0); +} +EXPORT_SYMBOL(LZ4_decompress_safe); - BYTE *op = (BYTE *) dest; - BYTE * const oend = op + maxoutputsize; - BYTE *cpy; +int LZ4_decompress_safe_partial(const char *source, char *dest, + int compressedSize, int targetOutputSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, endOnInputSize, partial, + targetOutputSize, noDict, (BYTE *)dest, NULL, 0); +} +EXPORT_SYMBOL(LZ4_decompress_safe_partial); - /* Main Loop */ - while (ip < iend) { +int LZ4_decompress_fast(const char *source, char *dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, withPrefix64k, + (BYTE *)(dest - 64 * KB), NULL, 64 * KB); +} +EXPORT_SYMBOL(LZ4_decompress_fast); - unsigned token; - size_t length; +int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, + const char *dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode; - /* get runlength */ - token = *ip++; - length = (token >> ML_BITS); - if 
(length == RUN_MASK) { - int s = 255; - while ((ip < iend) && (s == 255)) { - s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - } - } - /* copy literals */ - cpy = op + length; - if ((cpy > oend - COPYLENGTH) || - (ip + length > iend - COPYLENGTH)) { - - if (cpy > oend) - goto _output_error;/* writes beyond buffer */ - - if (ip + length != iend) - goto _output_error;/* - * Error: LZ4 format requires - * to consume all input - * at this stage - */ - memcpy(op, ip, length); - op += length; - break;/* Necessarily EOF, due to parsing restrictions */ - } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); - op = cpy; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} +EXPORT_SYMBOL(LZ4_setStreamDecode); - /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); - ip += 2; - if (ref < (BYTE * const) dest) - goto _output_error; - /* - * Error : offset creates reference - * outside of destination buffer - */ +/* + * *_continue() : + * These decoding functions allow decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks must still be available at the memory + * position where they were decoded. + * If it's not possible, save the relevant part of + * decoded data into a safe buffer, + * and indicate where it stands using LZ4_setStreamDecode() + */ +int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE *)dest) { + result = LZ4_decompress_generic(source, dest, + compressedSize, + maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, + lz4sd->externalDict, + lz4sd->extDictSize); + + if (result <= 0) + return result; + + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE *)dest, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE *)dest + result; + } - /* get matchlength */ - length = (token & ML_MASK); - if (length == ML_MASK) { - while (ip < iend) { - int s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - if (s == 255) - continue; - break; - } - } + return result; +} +EXPORT_SYMBOL(LZ4_decompress_safe_continue); - /* copy repeated sequence */ - if (unlikely((op - ref) < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op - ref]; - PUT4(ref, op); - op += STEPSIZE - 4; - ref -= dec64; - } else { - LZ4_COPYSTEP(ref, op); - } - cpy = op + length - (STEPSIZE-4); - if (cpy > oend - COPYLENGTH) { - if (cpy > oend) - goto _output_error; /* write outside of buf */ -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif - goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); - while (op < cpy) - *op++ = *ref++; - op = cpy; - /* - * 
Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - continue; - } - LZ4_SECURECOPY(ref, op, cpy); - op = cpy; /* correction */ +int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int originalSize) +{ + LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE *)dest) { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, + lz4sd->prefixEnd - lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + + if (result <= 0) + return result; + + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE *)dest, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; } - /* end of decoding */ - return (int) (((char *) op) - dest); - /* write overflow error detected */ -_output_error: - return -1; + return result; } +EXPORT_SYMBOL(LZ4_decompress_fast_continue); -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) +/* + * Advanced decoding functions : + * *_usingDict() : + * These decoding functions work the same as "_continue" ones, + * the dictionary must be explicitly provided within parameters + */ +static inline int LZ4_decompress_usingDict_generic(const char *source, + char *dest, int compressedSize, int maxOutputSize, int safe, + const char *dictStart, int dictSize) { - int ret = -1; - int input_len = 0; - - input_len = lz4_uncompress(src, dest, actual_dest_len); - if (input_len < 0) - goto exit_0; - *src_len = input_len; + if (dictSize == 0) + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, safe, full, 0, + noDict, (BYTE *)dest, NULL, 0); + if (dictStart + dictSize == dest) { + if (dictSize >= (int)(64 * KB - 1)) + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, safe, full, 0, + withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, + maxOutputSize, safe, full, 0, noDict, + (BYTE *)dest - dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, + maxOutputSize, safe, full, 0, usingExtDict, + (BYTE *)dest, (const BYTE *)dictStart, dictSize); +} - return 0; -exit_0: - return ret; +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const char *dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, + compressedSize, maxOutputSize, 1, dictStart, dictSize); } -#ifndef STATIC -EXPORT_SYMBOL(lz4_decompress); -#endif +EXPORT_SYMBOL(LZ4_decompress_safe_usingDict); -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len) +int LZ4_decompress_fast_usingDict(const char *source, char *dest, + int originalSize, const char *dictStart, int dictSize) { - int ret = -1; - int out_len = 0; - - out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, - *dest_len); - if (out_len < 0) - goto exit_0; - *dest_len = out_len; - - return 0; -exit_0: - return ret; + 
return LZ4_decompress_usingDict_generic(source, dest, 0, + originalSize, 0, dictStart, dictSize); +} +EXPORT_SYMBOL(LZ4_decompress_fast_usingDict); + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4_decompress_unknownoutputsize(const unsigned char *src, + size_t src_len, unsigned char *dest, size_t *dest_len) { + *dest_len = LZ4_decompress_safe(src, dest, + src_len, *dest_len); + + /* + * Prior lz4_decompress_unknownoutputsize will return + * 0 for success and a negative result for error + * new LZ4_decompress_safe returns + * - the length of data read on success + * - and also a negative result on error + * meaning when result > 0, we just return 0 here + */ + if (src_len > 0) + return 0; + else + return -1; } -#ifndef STATIC EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { + *src_len = LZ4_decompress_fast(src, dest, actual_dest_len); + + /* + * Prior lz4_decompress will return + * 0 for success and a negative result for error + * new LZ4_decompress_fast returns + * - the length of data read on success + * - and also a negative result on error + * meaning when result > 0, we just return 0 here + */ + if (*src_len > 0) + return 0; + else + return -1; +} +EXPORT_SYMBOL(lz4_decompress); + MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4 Decompressor"); -#endif +MODULE_DESCRIPTION("LZ4 decompressor"); diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index c79d7ea8a38e..23e1a1bd0d8d 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -1,157 +1,229 @@ +#ifndef __LZ4DEFS_H__ +#define __LZ4DEFS_H__ + /* - * lz4defs.h -- architecture specific defines - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * lz4defs.h -- common and architecture specific defines for the kernel usage + + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
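To make the in/out size parameter of the compatibility wrapper above concrete, a hypothetical caller of lz4_decompress_unknownoutputsize() (buffer names assumed, prototype assumed to stay declared in <linux/lz4.h>); the wrapper is intended to keep the old 0 / -1 convention, whereas a direct LZ4_decompress_safe() call would return the byte count itself (negative on error):

#include <linux/lz4.h>

/*
 * Hypothetical caller: *out_len goes in as the capacity of 'dst' and comes
 * back as the number of bytes actually decompressed.
 */
static int example_decompress_old_api(const unsigned char *src, size_t src_len,
				      unsigned char *dst, size_t dst_cap,
				      size_t *out_len)
{
	*out_len = dst_cap;
	return lz4_decompress_unknownoutputsize(src, src_len, dst, out_len);
}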
+ * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ +#include +#include /* memset, memcpy */ + /* * Detects 64 bits mode - */ +*/ #if defined(CONFIG_64BIT) #define LZ4_ARCH64 1 #else #define LZ4_ARCH64 0 #endif -/* - * Architecture-specific macros - */ -#define BYTE u8 -typedef struct _U16_S { u16 v; } U16_S; -typedef struct _U32_S { u32 v; } U32_S; -typedef struct _U64_S { u64 v; } U64_S; -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - -#define A16(x) (((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) - -#define PUT4(s, d) (A32(d) = A32(s)) -#define PUT8(s, d) (A64(d) = A64(s)) - -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - (d = s - A16(p)) - -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ - do { \ - A16(p) = v; \ - p += 2; \ - } while (0) -#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ - -#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) -#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) -#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) - -#define PUT4(s, d) \ - put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) -#define PUT8(s, d) \ - put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) - -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - (d = s - get_unaligned_le16(p)) - -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ - do { \ - put_unaligned_le16(v, (u16 *)(p)); \ - p += 2; \ - } while (0) -#endif +/*-************************************ + * Basic Types + **************************************/ +#include -#define COPYLENGTH 8 -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) -#define RUN_BITS (8 - ML_BITS) -#define RUN_MASK ((1U << RUN_BITS) - 1) -#define MEMORY_USAGE 14 -#define MINMATCH 4 -#define SKIPSTRENGTH 6 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) -#define MAXD_LOG 16 -#define MAXD (1 << MAXD_LOG) -#define MAXD_MASK (u32)(MAXD - 1) -#define MAX_DISTANCE (MAXD - 1) -#define HASH_LOG (MAXD_LOG - 1) -#define HASHTABLESIZE (1 << HASH_LOG) -#define MAX_NB_ATTEMPTS 256 -#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) -#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) -#define HASH64KTABLESIZE (1U << HASHLOG64K) -#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - (MEMORY_USAGE-2))) -#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - HASHLOG64K)) -#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - HASH_LOG)) - -#if LZ4_ARCH64/* 64-bit */ -#define STEPSIZE 8 - -#define LZ4_COPYSTEP(s, d) \ - do { \ - PUT8(s, d); \ - d += 8; \ - s += 8; \ - } while (0) - -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) - -#define LZ4_SECURECOPY(s, d, e) \ - do { \ - if (d < e) { \ - LZ4_WILDCOPY(s, d, e); \ - } \ - } while (0) -#define HTYPE u32 - -#ifdef __BIG_ENDIAN -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; + +/*-************************************ + * Constants + **************************************/ +#define MINMATCH 4 + +#define 
WILDCOPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (WILDCOPYLENGTH+MINMATCH) +static const int LZ4_minLength = (MFLIMIT+1); + +#define KB (1<<10) +#define MB (1<<20) +#define GB (1U<<30) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1<> 3) -#endif + const BYTE *p = (const BYTE *)memPtr; -#else /* 32-bit */ -#define STEPSIZE 4 + return (U16)((U16)p[0] + (p[1] << 8)); +#endif +} -#define LZ4_COPYSTEP(s, d) \ - do { \ - PUT4(s, d); \ - d += 4; \ - s += 4; \ - } while (0) +static inline void LZ4_writeLE16(void *memPtr, U16 value) +{ +#ifdef __LITTLE_ENDIAN__ + LZ4_write16(memPtr, value); +#else + BYTE *p = (BYTE *)memPtr; -#define LZ4_COPYPACKET(s, d) \ - do { \ - LZ4_COPYSTEP(s, d); \ - LZ4_COPYSTEP(s, d); \ - } while (0) + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); +#endif +} -#define LZ4_SECURECOPY LZ4_WILDCOPY -#define HTYPE const u8* +static inline void LZ4_copy8(void *dst, const void *src) +{ + memcpy(dst, src, 8); +} -#ifdef __BIG_ENDIAN +/* + * customized variant of memcpy, + * which can overwrite up to 7 bytes beyond dstEnd + */ +static inline void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd) +{ + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE *const e = (BYTE *)dstEnd; + + do { + LZ4_copy8(d, s); + d += 8; + s += 8; + } while (d < e); +} + +#if LZ4_ARCH64 +#ifdef __BIG_ENDIAN__ +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif +#else +#ifdef __BIG_ENDIAN__ #define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) #else #define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) #endif +#endif +static inline unsigned int LZ4_count(const BYTE *pIn, const BYTE *pMatch, + const BYTE *pInLimit) +{ + const BYTE *const pStart = pIn; + + while (likely(pIn < pInLimit-(STEPSIZE-1))) { + size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + continue; + } + pIn += LZ4_NBCOMMONBYTES(diff); + return (unsigned int)(pIn - pStart); + } + +#ifdef LZ4_ARCH64 + if ((pIn < (pInLimit-3)) + && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { + pIn += 4; pMatch += 4; + } #endif + if ((pIn < (pInLimit-1)) + && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { + pIn += 2; pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; + return (unsigned int)(pIn - pStart); +} + +typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; -#define LZ4_WILDCOPY(s, d, e) \ - do { \ - LZ4_COPYPACKET(s, d); \ - } while (d < e) - -#define LZ4_BLINDCOPY(s, d, l) \ - do { \ - u8 *e = (d) + l; \ - LZ4_WILDCOPY(s, d, e); \ - d = e; \ - } while (0) +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; + +#endif diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index f344f76b6559..f1d4662e59df 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -1,19 +1,17 @@ /* * LZ4 HC - High Compression Mode of LZ4 - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Copyright (C) 2011-2015, Yann Collet. 
* + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -25,323 +23,361 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * Changed for kernel use by: - * Chanho Min + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ -#include -#include +/*-************************************ + * Dependencies + **************************************/ #include -#include #include "lz4defs.h" +#include +#include +#include /* memset */ + +/* ************************************* + * Local Constants and types + ***************************************/ -struct lz4hc_data { - const u8 *base; - HTYPE hashtable[HASHTABLESIZE]; - u16 chaintable[MAXD]; - const u8 *nexttoupdate; -} __attribute__((__packed__)); +#define OPTIMAL_ML (int)((ML_MASK - 1) + MINMATCH) -static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +#define HASH_FUNCTION(i) (((i) * 2654435761U) \ + >> ((MINMATCH*8) - LZ4HC_HASH_LOG)) +#define DELTANEXTU16(p) chainTable[(U16)(p)] /* faster */ + +static U32 LZ4HC_hashPtr(const void *ptr) +{ + return HASH_FUNCTION(LZ4_read32(ptr)); +} + +/************************************** + * HC Compression + **************************************/ +static void LZ4HC_init(LZ4HC_CCtx_internal *hc4, const BYTE *start) { - memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); - memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); - -#if LZ4_ARCH64 - hc4->nexttoupdate = base + 1; -#else - hc4->nexttoupdate = base; -#endif - hc4->base = base; - return 1; + memset((void *)hc4->hashTable, 0, sizeof(hc4->hashTable)); + memset(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); + hc4->nextToUpdate = 64 * KB; + hc4->base = start - 64 * KB; + hc4->end = start; + hc4->dictBase = start - 64 * KB; + hc4->dictLimit = 64 * KB; + hc4->lowLimit = 64 * KB; } /* Update chains up to ip (excluded) */ -static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +static inline void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4, + const BYTE *ip) { - u16 *chaintable = hc4->chaintable; - HTYPE *hashtable = hc4->hashtable; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const hashTable = hc4->hashTable; const 
BYTE * const base = hc4->base; -#else - const int base = 0; -#endif + U32 const target = (U32)(ip - base); + U32 idx = hc4->nextToUpdate; + + while (idx < target) { + U32 const h = LZ4HC_hashPtr(base + idx); + size_t delta = idx - hashTable[h]; - while (hc4->nexttoupdate < ip) { - const u8 *p = hc4->nexttoupdate; - size_t delta = p - (hashtable[HASH_VALUE(p)] + base); if (delta > MAX_DISTANCE) delta = MAX_DISTANCE; - chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; - hashtable[HASH_VALUE(p)] = (p) - base; - hc4->nexttoupdate++; - } -} -static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, - const u8 *const matchlimit) -{ - const u8 *p1t = p1; - - while (p1t < matchlimit - (STEPSIZE - 1)) { -#if LZ4_ARCH64 - u64 diff = A64(p2) ^ A64(p1t); -#else - u32 diff = A32(p2) ^ A32(p1t); -#endif - if (!diff) { - p1t += STEPSIZE; - p2 += STEPSIZE; - continue; - } - p1t += LZ4_NBCOMMONBYTES(diff); - return p1t - p1; - } -#if LZ4_ARCH64 - if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { - p1t += 4; - p2 += 4; - } -#endif + DELTANEXTU16(idx) = (U16)delta; - if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { - p1t += 2; - p2 += 2; + hashTable[h] = idx; + idx++; } - if ((p1t < matchlimit) && (*p2 == *p1t)) - p1t++; - return p1t - p1; + + hc4->nextToUpdate = target; } -static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +static inline int LZ4HC_InsertAndFindBestMatch( + LZ4HC_CCtx_internal *hc4, /* Index table will be updated */ + const BYTE *ip, + const BYTE * const iLimit, + const BYTE **matchpos, + const int maxNbAttempts) { - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; - const u8 *ref; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const HashTable = hc4->hashTable; const BYTE * const base = hc4->base; -#else - const int base = 0; -#endif - int nbattempts = MAX_NB_ATTEMPTS; - size_t repl = 0, ml = 0; - u16 delta; + const BYTE * const dictBase = hc4->dictBase; + const U32 dictLimit = hc4->dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base)) + ? 
hc4->lowLimit + : (U32)(ip - base) - (64 * KB - 1); + U32 matchIndex; + int nbAttempts = maxNbAttempts; + size_t ml = 0; /* HC4 match finder */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - /* potential repetition */ - if (ref >= ip-4) { - /* confirmed */ - if (A32(ref) == A32(ip)) { - delta = (u16)(ip-ref); - repl = ml = lz4hc_commonlength(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; - *matchpos = ref; - } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex >= lowLimit) + && (nbAttempts)) { + nbAttempts--; + if (matchIndex >= dictLimit) { + const BYTE * const match = base + matchIndex; + + if (*(match + ml) == *(ip + ml) + && (LZ4_read32(match) == LZ4_read32(ip))) { + size_t const mlt = LZ4_count(ip + MINMATCH, + match + MINMATCH, iLimit) + MINMATCH; - while ((ref >= ip - MAX_DISTANCE) && nbattempts) { - nbattempts--; - if (*(ref + ml) == *(ip + ml)) { - if (A32(ref) == A32(ip)) { - size_t mlt = - lz4hc_commonlength(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; if (mlt > ml) { ml = mlt; - *matchpos = ref; + *matchpos = match; + } + } + } else { + const BYTE * const match = dictBase + matchIndex; + + if (LZ4_read32(match) == LZ4_read32(ip)) { + size_t mlt; + const BYTE *vLimit = ip + + (dictLimit - matchIndex); + + if (vLimit > iLimit) + vLimit = iLimit; + mlt = LZ4_count(ip + MINMATCH, + match + MINMATCH, vLimit) + MINMATCH; + if ((ip + mlt == vLimit) + && (vLimit < iLimit)) + mlt += LZ4_count(ip + mlt, + base + dictLimit, + iLimit); + if (mlt > ml) { + /* virtual matchpos */ + ml = mlt; + *matchpos = base + matchIndex; } } } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } - - /* Complete table */ - if (repl) { - const BYTE *ptr = ip; - const BYTE *end; - end = ip + repl - (MINMATCH-1); - /* Pre-Load */ - while (ptr < end - delta) { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - ptr++; - } - do { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - /* Head of chain */ - hashtable[HASH_VALUE(ptr)] = (ptr) - base; - ptr++; - } while (ptr < end); - hc4->nexttoupdate = end; + matchIndex -= DELTANEXTU16(matchIndex); } return (int)ml; } -static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, - const u8 **matchpos, const u8 **startpos) +static inline int LZ4HC_InsertAndGetWiderMatch( + LZ4HC_CCtx_internal *hc4, + const BYTE * const ip, + const BYTE * const iLowLimit, + const BYTE * const iHighLimit, + int longest, + const BYTE **matchpos, + const BYTE **startpos, + const int maxNbAttempts) { - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const HashTable = hc4->hashTable; const BYTE * const base = hc4->base; -#else - const int base = 0; -#endif - const u8 *ref; - int nbattempts = MAX_NB_ATTEMPTS; - int delta = (int)(ip - startlimit); + const U32 dictLimit = hc4->dictLimit; + const BYTE * const lowPrefixPtr = base + dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base)) + ? 
hc4->lowLimit + : (U32)(ip - base) - (64 * KB - 1); + const BYTE * const dictBase = hc4->dictBase; + U32 matchIndex; + int nbAttempts = maxNbAttempts; + int delta = (int)(ip - iLowLimit); /* First Match */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) - && (nbattempts)) { - nbattempts--; - if (*(startlimit + longest) == *(ref - delta + longest)) { - if (A32(ref) == A32(ip)) { - const u8 *reft = ref + MINMATCH; - const u8 *ipt = ip + MINMATCH; - const u8 *startt = ip; - - while (ipt < matchlimit-(STEPSIZE - 1)) { - #if LZ4_ARCH64 - u64 diff = A64(reft) ^ A64(ipt); - #else - u32 diff = A32(reft) ^ A32(ipt); - #endif - - if (!diff) { - ipt += STEPSIZE; - reft += STEPSIZE; - continue; + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex >= lowLimit) + && (nbAttempts)) { + nbAttempts--; + if (matchIndex >= dictLimit) { + const BYTE *matchPtr = base + matchIndex; + + if (*(iLowLimit + longest) + == *(matchPtr - delta + longest)) { + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { + int mlt = MINMATCH + LZ4_count( + ip + MINMATCH, + matchPtr + MINMATCH, + iHighLimit); + int back = 0; + + while ((ip + back > iLowLimit) + && (matchPtr + back > lowPrefixPtr) + && (ip[back - 1] == matchPtr[back - 1])) + back--; + + mlt -= back; + + if (mlt > longest) { + longest = (int)mlt; + *matchpos = matchPtr + back; + *startpos = ip + back; } - ipt += LZ4_NBCOMMONBYTES(diff); - goto _endcount; - } - #if LZ4_ARCH64 - if ((ipt < (matchlimit - 3)) - && (A32(reft) == A32(ipt))) { - ipt += 4; - reft += 4; } - ipt += 2; - #endif - if ((ipt < (matchlimit - 1)) - && (A16(reft) == A16(ipt))) { - reft += 2; - } - if ((ipt < matchlimit) && (*reft == *ipt)) - ipt++; -_endcount: - reft = ref; - - while ((startt > startlimit) - && (reft > hc4->base) - && (startt[-1] == reft[-1])) { - startt--; - reft--; - } - - if ((ipt - startt) > longest) { - longest = (int)(ipt - startt); - *matchpos = reft; - *startpos = startt; + } + } else { + const BYTE * const matchPtr = dictBase + matchIndex; + + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { + size_t mlt; + int back = 0; + const BYTE *vLimit = ip + (dictLimit - matchIndex); + + if (vLimit > iHighLimit) + vLimit = iHighLimit; + + mlt = LZ4_count(ip + MINMATCH, + matchPtr + MINMATCH, vLimit) + MINMATCH; + + if ((ip + mlt == vLimit) && (vLimit < iHighLimit)) + mlt += LZ4_count(ip + mlt, base + dictLimit, + iHighLimit); + while ((ip + back > iLowLimit) + && (matchIndex + back > lowLimit) + && (ip[back - 1] == matchPtr[back - 1])) + back--; + + mlt -= back; + + if ((int)mlt > longest) { + longest = (int)mlt; + *matchpos = base + matchIndex + back; + *startpos = ip + back; } } } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + + matchIndex -= DELTANEXTU16(matchIndex); } + return longest; } -static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, - int ml, const u8 *ref) +static inline int LZ4HC_encodeSequence( + const BYTE **ip, + BYTE **op, + const BYTE **anchor, + int matchLength, + const BYTE * const match, + limitedOutput_directive limitedOutputBuffer, + BYTE *oend) { - int length, len; - u8 *token; + int length; + BYTE *token; /* Encode Literal length */ length = (int)(*ip - *anchor); token = (*op)++; + + if ((limitedOutputBuffer) + && ((*op + (length>>8) + + length + (2 + 1 + LASTLITERALS)) > oend)) { + /* Check output limit */ + return 1; + } if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); + int len; + + *token = 
(RUN_MASK< 254 ; len -= 255) *(*op)++ = 255; - *(*op)++ = (u8)len; + *(*op)++ = (BYTE)len; } else - *token = (length << ML_BITS); + *token = (BYTE)(length<= (int)ML_MASK) { + length = (int)(matchLength - MINMATCH); + + if ((limitedOutputBuffer) + && (*op + (length>>8) + + (1 + LASTLITERALS) > oend)) { + /* Check output limit */ + return 1; + } + + if (length >= (int)ML_MASK) { *token += ML_MASK; - len -= ML_MASK; - for (; len > 509 ; len -= 510) { + length -= ML_MASK; + + for (; length > 509 ; length -= 510) { *(*op)++ = 255; *(*op)++ = 255; } - if (len > 254) { - len -= 255; + + if (length > 254) { + length -= 255; *(*op)++ = 255; } - *(*op)++ = (u8)len; + + *(*op)++ = (BYTE)length; } else - *token += len; + *token += (BYTE)(length); /* Prepare next loop */ - *ip += ml; + *ip += matchLength; *anchor = *ip; return 0; } -static int lz4_compresshcctx(struct lz4hc_data *ctx, - const char *source, - char *dest, - int isize) +static int LZ4HC_compress_generic( + LZ4HC_CCtx_internal *const ctx, + const char * const source, + char * const dest, + int const inputSize, + int const maxOutputSize, + int compressionLevel, + limitedOutput_directive limit + ) { - const u8 *ip = (const u8 *)source; - const u8 *anchor = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - const u8 *const matchlimit = (iend - LASTLITERALS); + const BYTE *ip = (const BYTE *) source; + const BYTE *anchor = ip; + const BYTE * const iend = ip + inputSize; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = (iend - LASTLITERALS); - u8 *op = (u8 *)dest; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxOutputSize; + unsigned int maxNbAttempts; int ml, ml2, ml3, ml0; - const u8 *ref = NULL; - const u8 *start2 = NULL; - const u8 *ref2 = NULL; - const u8 *start3 = NULL; - const u8 *ref3 = NULL; - const u8 *start0; - const u8 *ref0; - int lastrun; + const BYTE *ref = NULL; + const BYTE *start2 = NULL; + const BYTE *ref2 = NULL; + const BYTE *start3 = NULL; + const BYTE *ref3 = NULL; + const BYTE *start0; + const BYTE *ref0; + + /* init */ + if (compressionLevel > LZ4HC_MAX_CLEVEL) + compressionLevel = LZ4HC_MAX_CLEVEL; + if (compressionLevel < 1) + compressionLevel = LZ4HC_DEFAULT_CLEVEL; + maxNbAttempts = 1 << (compressionLevel - 1); + ctx->end += inputSize; ip++; /* Main Loop */ while (ip < mflimit) { - ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + ml = LZ4HC_InsertAndFindBestMatch(ctx, ip, + matchlimit, (&ref), maxNbAttempts); if (!ml) { ip++; continue; @@ -351,51 +387,59 @@ static int lz4_compresshcctx(struct lz4hc_data *ctx, start0 = ip; ref0 = ref; ml0 = ml; -_search2: - if (ip+ml < mflimit) - ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, - ip + 1, matchlimit, ml, &ref2, &start2); + +_Search2: + if (ip + ml < mflimit) + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, + ip + ml - 2, ip + 0, + matchlimit, ml, &ref2, + &start2, maxNbAttempts); else ml2 = ml; - /* No better match */ + if (ml2 == ml) { - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + /* No better match */ + if (LZ4HC_encodeSequence(&ip, &op, + &anchor, ml, ref, limit, oend)) + return 0; continue; } if (start0 < ip) { - /* empirical */ if (start2 < ip + ml0) { + /* empirical */ ip = start0; ref = ref0; ml = ml0; } } - /* - * Here, start0==ip - * First Match too small : removed - */ + + /* Here, start0 == ip */ if ((start2 - ip) < 3) { + /* First Match too small : removed */ ml = ml2; ip = start2; ref = ref2; - goto _search2; + goto _Search2; } -_search3: +_Search3: /* - * 
Currently we have : - * ml2 > ml1, and - * ip1+3 <= ip2 (usually < ip1+ml1) - */ + * Currently we have : + * ml2 > ml1, and + * ip1 + 3 <= ip2 (usually < ip1 + ml1) + */ if ((start2 - ip) < OPTIMAL_ML) { int correction; int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; if (ip + new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { start2 += correction; ref2 += correction; @@ -403,39 +447,44 @@ _search3: } } /* - * Now, we have start2 = ip+new_ml, - * with new_ml=min(ml, OPTIMAL_ML=18) + * Now, we have start2 = ip + new_ml, + * with new_ml = min(ml, OPTIMAL_ML = 18) */ + if (start2 + ml2 < mflimit) - ml3 = lz4hc_insertandgetwidermatch(ctx, - start2 + ml2 - 3, start2, matchlimit, - ml2, &ref3, &start3); + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, + start2 + ml2 - 3, start2, + matchlimit, ml2, &ref3, &start3, + maxNbAttempts); else ml3 = ml2; - /* No better match : 2 sequences to encode */ if (ml3 == ml2) { + /* No better match : 2 sequences to encode */ /* ip & ref are known; Now for ml */ - if (start2 < ip+ml) + if (start2 < ip + ml) ml = (int)(start2 - ip); - /* Now, encode 2 sequences */ - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml, ref, limit, oend)) + return 0; ip = start2; - lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml2, ref2, limit, oend)) + return 0; continue; } - /* Not enough space for match 2 : remove it */ if (start3 < ip + ml + 3) { - /* - * can write Seq1 immediately ==> Seq2 is removed, - * so Seq3 becomes Seq1 - */ + /* Not enough space for match 2 : remove it */ if (start3 >= (ip + ml)) { + /* can write Seq1 immediately + * ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ if (start2 < ip + ml) { - int correction = - (int)(ip + ml - start2); + int correction = (int)(ip + ml - start2); + start2 += correction; ref2 += correction; ml2 -= correction; @@ -446,35 +495,38 @@ _search3: } } - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - ip = start3; + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml, ref, limit, oend)) + return 0; + ip = start3; ref = ref3; - ml = ml3; + ml = ml3; start0 = start2; ref0 = ref2; ml0 = ml2; - goto _search2; + goto _Search2; } start2 = start3; ref2 = ref3; ml2 = ml3; - goto _search3; + goto _Search3; } /* - * OK, now we have 3 ascending matches; let's write at least - * the first one ip & ref are known; Now for ml - */ + * OK, now we have 3 ascending matches; + * let's write at least the first one + * ip & ref are known; Now for ml + */ if (start2 < ip + ml) { if ((start2 - ip) < (int)ML_MASK) { int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; if (ip + ml > start2 + ml2 - MINMATCH) - ml = (int)(start2 - ip) + ml2 - - MINMATCH; + ml = (int)(start2 - ip) + ml2 - MINMATCH; correction = ml - (int)(start2 - ip); if (correction > 0) { start2 += correction; @@ -484,7 +536,9 @@ _search3: } else ml = (int)(start2 - ip); } - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, + ref, limit, oend)) + return 0; ip = start2; ref = ref2; @@ -494,46 +548,245 @@ _search3: ref2 = ref3; ml2 = ml3; - goto _search3; + goto _Search3; } /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8) lastrun; - } else - *op++ = (lastrun << ML_BITS); 
- memcpy(op, anchor, iend - anchor); - op += iend - anchor; + { + int lastRun = (int)(iend - anchor); + + if ((limit) + && (((char *)op - dest) + lastRun + 1 + + ((lastRun + 255 - RUN_MASK)/255) + > (U32)maxOutputSize)) { + /* Check output limit */ + return 0; + } + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK< 254 ; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE) lastRun; + } else + *op++ = (BYTE)(lastRun<internal_donotuse; - struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; - lz4hc_init(hc4, (const u8 *)src); - out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, - (char *)dst, (int)src_len); + if (((size_t)(state)&(sizeof(void *) - 1)) != 0) { + /* Error : state is not aligned + * for pointers (32 or 64 bits) + */ + return 0; + } - if (out_len < 0) - goto exit; + LZ4HC_init(ctx, (const BYTE *)src); - *dst_len = out_len; - return 0; + if (maxDstSize < LZ4_compressBound(srcSize)) + return LZ4HC_compress_generic(ctx, src, dst, + srcSize, maxDstSize, compressionLevel, limitedOutput); + else + return LZ4HC_compress_generic(ctx, src, dst, + srcSize, maxDstSize, compressionLevel, noLimit); +} + +int LZ4_compress_HC(const char *src, char *dst, int srcSize, + int maxDstSize, int compressionLevel, void *wrkmem) +{ + return LZ4_compress_HC_extStateHC(wrkmem, src, dst, + srcSize, maxDstSize, compressionLevel); +} +EXPORT_SYMBOL(LZ4_compress_HC); + +/************************************** + * Streaming Functions + **************************************/ +void LZ4_resetStreamHC(LZ4_streamHC_t *LZ4_streamHCPtr, int compressionLevel) +{ + LZ4_streamHCPtr->internal_donotuse.base = NULL; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned int)compressionLevel; +} + +int LZ4_loadDictHC(LZ4_streamHC_t *LZ4_streamHCPtr, + const char *dictionary, + int dictSize) +{ + LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + + if (dictSize > 64 * KB) { + dictionary += dictSize - 64 * KB; + dictSize = 64 * KB; + } + LZ4HC_init(ctxPtr, (const BYTE *)dictionary); + if (dictSize >= 4) + LZ4HC_Insert(ctxPtr, (const BYTE *)dictionary + (dictSize - 3)); + ctxPtr->end = (const BYTE *)dictionary + dictSize; + return dictSize; +} +EXPORT_SYMBOL(LZ4_loadDictHC); + +/* compression */ + +static void LZ4HC_setExternalDict( + LZ4HC_CCtx_internal *ctxPtr, + const BYTE *newBlock) +{ + if (ctxPtr->end >= ctxPtr->base + 4) { + /* Referencing remaining dictionary content */ + LZ4HC_Insert(ctxPtr, ctxPtr->end - 3); + } + + /* + * Only one memory segment for extDict, + * so any previous extDict is lost at this stage + */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); + ctxPtr->dictBase = ctxPtr->base; + ctxPtr->base = newBlock - ctxPtr->dictLimit; + ctxPtr->end = newBlock; + /* match referencing will resume from there */ + ctxPtr->nextToUpdate = ctxPtr->dictLimit; +} +EXPORT_SYMBOL(LZ4HC_setExternalDict); + +static int LZ4_compressHC_continue_generic( + LZ4_streamHC_t *LZ4_streamHCPtr, + const char *source, + char *dest, + int inputSize, + int maxOutputSize, + limitedOutput_directive limit) +{ + LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + + /* auto - init if forgotten */ + if (ctxPtr->base == NULL) + LZ4HC_init(ctxPtr, (const BYTE *) source); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 * GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) + - ctxPtr->dictLimit; + if (dictSize > 64 * KB) + dictSize = 64 * KB; + LZ4_loadDictHC(LZ4_streamHCPtr, + (const char 
*)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE *)source != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE *)source); + + /* Check overlapping input/dictionary space */ + { + const BYTE *sourceEnd = (const BYTE *) source + inputSize; + const BYTE * const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; + const BYTE * const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + + if ((sourceEnd > dictBegin) + && ((const BYTE *)source < dictEnd)) { + if (sourceEnd > dictEnd) + sourceEnd = dictEnd; + ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); + + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) + ctxPtr->lowLimit = ctxPtr->dictLimit; + } + } + + return LZ4HC_compress_generic(ctxPtr, source, dest, + inputSize, maxOutputSize, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue( + LZ4_streamHC_t *LZ4_streamHCPtr, + const char *source, + char *dest, + int inputSize, + int maxOutputSize) +{ + if (maxOutputSize < LZ4_compressBound(inputSize)) + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, + source, dest, inputSize, maxOutputSize, limitedOutput); + else + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, + source, dest, inputSize, maxOutputSize, noLimit); +} +EXPORT_SYMBOL(LZ4_compress_HC_continue); + +/* dictionary saving */ + +int LZ4_saveDictHC( + LZ4_streamHC_t *LZ4_streamHCPtr, + char *safeBuffer, + int dictSize) +{ + LZ4HC_CCtx_internal *const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = (int)(streamPtr->end + - (streamPtr->base + streamPtr->dictLimit)); + + if (dictSize > 64 * KB) + dictSize = 64 * KB; + if (dictSize < 4) + dictSize = 0; + if (dictSize > prefixSize) + dictSize = prefixSize; + + memmove(safeBuffer, streamPtr->end - dictSize, dictSize); -exit: - return ret; + { + U32 const endIndex = (U32)(streamPtr->end - streamPtr->base); + + streamPtr->end = (const BYTE *)safeBuffer + dictSize; + streamPtr->base = streamPtr->end - endIndex; + streamPtr->dictLimit = endIndex - dictSize; + streamPtr->lowLimit = endIndex - dictSize; + + if (streamPtr->nextToUpdate < streamPtr->dictLimit) + streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} +EXPORT_SYMBOL(LZ4_saveDictHC); + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + *dst_len = LZ4_compress_HC(src, dst, src_len, + *dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem); + + /* + * Prior lz4hc_compress will return -1 in case of error + * and 0 on success + * while new LZ4_compress_HC + * returns 0 in case of error + * and the output length on success + */ + if (!*dst_len) + return -1; + else + return 0; } EXPORT_SYMBOL(lz4hc_compress); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4HC compressor"); +MODULE_DESCRIPTION("LZ4 HC compressor"); -- cgit v1.2.3 From 26d3f6eca673dab29a30ecfc5f5655978b765070 Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Wed, 8 Feb 2017 10:27:59 +1100 Subject: lib/lz4: remove back-compat wrappers Remove the functions introduced as wrappers for providing backwards compatibility to the prior LZ4 version. They're not needed anymore since there's no callers left. 
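For any remaining or future in-kernel user, switching from the removed lz4hc_compress() wrapper to LZ4_compress_HC() is mechanical; only the return convention changes (the new function returns the compressed length, or 0 on error, instead of 0/-1 with the length passed back through a pointer). A minimal sketch of such a conversion; the helper name my_pack() and the -EINVAL error code are illustrative, not part of the patch:

	#include <linux/lz4.h>
	#include <linux/errno.h>

	/* Sketch only: absorbs the LZ4_compress_HC() return convention. */
	static int my_pack(const char *src, size_t src_len,
			   char *dst, size_t dst_size, size_t *out_len,
			   void *wrkmem)	/* LZ4HC_MEM_COMPRESS bytes */
	{
		int ret = LZ4_compress_HC(src, dst, src_len, dst_size,
					  LZ4HC_DEFAULT_CLEVEL, wrkmem);

		if (ret == 0)		/* 0 now means error / output did not fit */
			return -EINVAL;
		*out_len = ret;		/* compressed length on success */
		return 0;		/* same contract the old wrapper gave */
	}
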
Link: http://lkml.kernel.org/r/1486321748-19085-6-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S. Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton --- include/linux/lz4.h | 69 ------------------------------------------------ lib/lz4/lz4_compress.c | 22 --------------- lib/lz4/lz4_decompress.c | 42 ----------------------------- lib/lz4/lz4hc_compress.c | 23 ---------------- 4 files changed, 156 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 1b7ab2a43aed..a3912d7984b5 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -172,18 +172,6 @@ static inline int LZ4_compressBound(size_t isize) return LZ4_COMPRESSBOUND(isize); } -/** - * lz4_compressbound() - For backwards compatibility; see LZ4_compressBound - * @isize: Size of the input data - * - * Return: Max. size LZ4 may output in a "worst case" szenario - * (data not compressible) - */ -static inline int lz4_compressbound(size_t isize) -{ - return LZ4_COMPRESSBOUND(isize); -} - /** * LZ4_compress_default() - Compress data from source to dest * @source: source address of the original data @@ -257,20 +245,6 @@ int LZ4_compress_fast(const char *source, char *dest, int inputSize, int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, int targetDestSize, void *wrkmem); -/* - * lz4_compress() - For backward compatibility, see LZ4_compress_default - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND - * @dst_len: is the output size, which is returned after compress done - * @workmem: address of the working memory. 
- * - * Return: Success if return 0, Error if return < 0 - */ -int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /*-************************************************************************ * Decompression Functions **************************************************************************/ @@ -346,34 +320,6 @@ int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, int LZ4_decompress_safe_partial(const char *source, char *dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); -/* - * lz4_decompress_unknownoutputsize() - For backwards compatibility, - * see LZ4_decompress_safe - * @src: source address of the compressed data - * @src_len: is the input size, therefore the compressed size - * @dest: output buffer address of the decompressed data - * which must be already allocated - * @dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after decompress done - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); - -/** - * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast - * @src: source address of the compressed data - * @src_len: is the input size, which is returned after decompress done - * @dest: output buffer address of the decompressed data, - * which must be already allocated - * @actual_dest_len: is the size of uncompressed data, supposing it's known - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); - /*-************************************************************************ * LZ4 HC Compression **************************************************************************/ @@ -400,21 +346,6 @@ int lz4_decompress(const unsigned char *src, size_t *src_len, int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, int compressionLevel, void *wrkmem); -/** - * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND. - * @dst_len: is the output size, which is returned after compress done - * @wrkmem: address of the working memory. - * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. 
- * - * Return : Success if return 0, Error if return (< 0) - */ -int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /** * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index 6aa7ac381e1a..697dbda34408 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -883,27 +883,5 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source, } EXPORT_SYMBOL(LZ4_compress_fast_continue); -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem) { - *dst_len = LZ4_compress_default(src, dst, src_len, - *dst_len, wrkmem); - - /* - * Prior lz4_compress will return -1 in case of error - * and 0 on success - * while new LZ4_compress_fast/default - * returns 0 in case of error - * and the output length on success - */ - if (!*dst_len) - return -1; - else - return 0; -} -EXPORT_SYMBOL(lz4_compress); - MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index fd665ca2ab0c..9bf918233749 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -483,47 +483,5 @@ int LZ4_decompress_fast_usingDict(const char *source, char *dest, } EXPORT_SYMBOL(LZ4_decompress_fast_usingDict); -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4_decompress_unknownoutputsize(const unsigned char *src, - size_t src_len, unsigned char *dest, size_t *dest_len) { - *dest_len = LZ4_decompress_safe(src, dest, - src_len, *dest_len); - - /* - * Prior lz4_decompress_unknownoutputsize will return - * 0 for success and a negative result for error - * new LZ4_decompress_safe returns - * - the length of data read on success - * - and also a negative result on error - * meaning when result > 0, we just return 0 here - */ - if (src_len > 0) - return 0; - else - return -1; -} -EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); - -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) { - *src_len = LZ4_decompress_fast(src, dest, actual_dest_len); - - /* - * Prior lz4_decompress will return - * 0 for success and a negative result for error - * new LZ4_decompress_fast returns - * - the length of data read on success - * - and also a negative result on error - * meaning when result > 0, we just return 0 here - */ - if (*src_len > 0) - return 0; - else - return -1; -} -EXPORT_SYMBOL(lz4_decompress); - MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 decompressor"); diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index f1d4662e59df..836329219181 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -765,28 +765,5 @@ int LZ4_saveDictHC( } EXPORT_SYMBOL(LZ4_saveDictHC); -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4hc_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem) -{ - *dst_len = LZ4_compress_HC(src, dst, src_len, - *dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem); - - /* - * Prior lz4hc_compress will return -1 in case of error - * and 0 on success - * while new LZ4_compress_HC - * returns 0 in case of error - * 
and the output length on success - */ - if (!*dst_len) - return -1; - else - return 0; -} -EXPORT_SYMBOL(lz4hc_compress); - MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 HC compressor"); -- cgit v1.2.3 From 1930faf201e83f6c4e6197a3aba30fb7f6c5ef2e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 8 Feb 2017 10:28:02 +1100 Subject: kprobes: move kprobe declarations to asm-generic/kprobes.h Often all is needed is these small helpers, instead of compiler.h or a full kprobes.h. This is important for asm helpers, in fact even some asm/kprobes.h make use of these helpers... instead just keep a generic asm file with helpers useful for asm code with the least amount of clutter as possible. Likewise we need now to also address what to do about this file for both when architectures have CONFIG_HAVE_KPROBES, and when they do not. Then for when architectures have CONFIG_HAVE_KPROBES but have disabled CONFIG_KPROBES. Right now most asm/kprobes.h do not have guards against CONFIG_KPROBES, this means most architecture code cannot include asm/kprobes.h safely. Correct this and add guards for architectures missing them. Additionally provide architectures that not have kprobes support with the default asm-generic solution. This lets us force asm/kprobes.h on the header include/linux/kprobes.h always, but most importantly we can now safely include just asm/kprobes.h on architecture code without bringing the full kitchen sink of header files. Two architectures already provided a guard against CONFIG_KPROBES on its kprobes.h: sh, arch. The rest of the architectures needed gaurds added. We avoid including any not-needed headers on asm/kprobes.h unless kprobes have been enabled. In a subsequent atomic change we can try now to remove compiler.h from include/linux/kprobes.h. During this sweep I've also identified a few architectures defining a common macro needed for both kprobes and ftrace, that of the definition of the breakput instruction up. Some refer to this as BREAKPOINT_INSTRUCTION. This must be kept outside of the #ifdef CONFIG_KPROBES guard. Link: http://lkml.kernel.org/r/20170203233139.32682-1-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Masami Hiramatsu Cc: Arnd Bergmann Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Steven Rostedt Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/kprobes.h | 6 ++++-- arch/arm/include/asm/kprobes.h | 4 ++++ arch/arm/probes/decode.h | 1 + arch/arm64/include/asm/kprobes.h | 4 ++++ arch/arm64/kernel/insn.c | 1 + arch/arm64/kernel/probes/decode-insn.h | 2 ++ arch/avr32/include/asm/kprobes.h | 7 ++++++- arch/blackfin/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/cris/include/asm/Kbuild | 1 + arch/frv/include/asm/Kbuild | 1 + arch/h8300/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/kprobes.h | 12 +++++++++--- arch/m32r/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/metag/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/kprobes.h | 6 +++++- arch/mn10300/include/asm/kprobes.h | 7 ++++++- arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/kprobes.h | 3 +++ arch/s390/include/asm/kprobes.h | 7 ++++++- arch/score/include/asm/Kbuild | 1 + arch/sh/include/asm/kprobes.h | 5 ++++- arch/sparc/include/asm/kprobes.h | 10 ++++++++-- arch/tile/include/asm/kprobes.h | 6 +++++- arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/x86/include/asm/kprobes.h | 9 ++++++++- arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/kprobes.h | 25 +++++++++++++++++++++++++ include/linux/compiler.h | 8 -------- include/linux/kprobes.h | 19 +++---------------- 38 files changed, 123 insertions(+), 38 deletions(-) create mode 100644 include/asm-generic/kprobes.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 5b01704315f4..93ea921f0bf2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7200,6 +7200,7 @@ M: Masami Hiramatsu S: Maintained F: Documentation/kprobes.txt F: include/linux/kprobes.h +F: include/asm-generic/kprobes.h F: kernel/kprobes.c KS0108 LCD CONTROLLER DRIVER diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 4c5468b03e45..340d500501b7 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += preempt.h generic-y += sections.h generic-y += trace_clock.h generic-y += current.h +generic-y += kprobes.h diff --git a/arch/arc/include/asm/kprobes.h b/arch/arc/include/asm/kprobes.h index 944dbedb38b5..00bdbe167615 100644 --- a/arch/arc/include/asm/kprobes.h +++ b/arch/arc/include/asm/kprobes.h @@ -9,6 +9,8 @@ #ifndef _ARC_KPROBES_H #define _ARC_KPROBES_H +#include + #ifdef CONFIG_KPROBES typedef u16 kprobe_opcode_t; @@ -55,6 +57,6 @@ void trap_is_kprobe(unsigned long address, struct pt_regs *regs); static void trap_is_kprobe(unsigned long address, struct pt_regs *regs) { } -#endif +#endif /* CONFIG_KPROBES */ -#endif +#endif /* _ARC_KPROBES_H */ diff --git a/arch/arm/include/asm/kprobes.h b/arch/arm/include/asm/kprobes.h index 3ea9be559726..59655459da59 100644 --- a/arch/arm/include/asm/kprobes.h +++ b/arch/arm/include/asm/kprobes.h @@ -16,6 +16,9 @@ #ifndef _ARM_KPROBES_H #define _ARM_KPROBES_H +#include + +#ifdef CONFIG_KPROBES #include #include #include @@ -83,4 +86,5 @@ struct arch_optimized_insn { */ }; +#endif /* CONFIG_KPROBES */ #endif /* _ARM_KPROBES_H */ diff --git a/arch/arm/probes/decode.h b/arch/arm/probes/decode.h index f9b08ba7fe73..548d622a3159 100644 --- a/arch/arm/probes/decode.h +++ b/arch/arm/probes/decode.h @@ -22,6 +22,7 @@ #include #include #include +#include void 
__init arm_probes_decode_init(void); diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 1737aecfcc5e..6deb8d726041 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -16,6 +16,9 @@ #ifndef _ARM_KPROBES_H #define _ARM_KPROBES_H +#include + +#ifdef CONFIG_KPROBES #include #include #include @@ -57,4 +60,5 @@ int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr); void kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); +#endif /* CONFIG_KPROBES */ #endif /* _ARM_KPROBES_H */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 94b62c1fa4df..ae912304ed84 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -31,6 +31,7 @@ #include #include #include +#include #define AARCH64_INSN_SF_BIT BIT(31) #define AARCH64_INSN_N_BIT BIT(22) diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 76d3f315407f..192ab007bacb 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -16,6 +16,8 @@ #ifndef _ARM_KERNEL_KPROBES_ARM64_H #define _ARM_KERNEL_KPROBES_ARM64_H +#include + /* * ARM strongly recommends a limit of 128 bytes between LoadExcl and * StoreExcl instructions in a single thread of execution. So keep the diff --git a/arch/avr32/include/asm/kprobes.h b/arch/avr32/include/asm/kprobes.h index 45f563ed73fd..28dfc61ad384 100644 --- a/arch/avr32/include/asm/kprobes.h +++ b/arch/avr32/include/asm/kprobes.h @@ -11,10 +11,14 @@ #ifndef __ASM_AVR32_KPROBES_H #define __ASM_AVR32_KPROBES_H +#include + +#define BREAKPOINT_INSTRUCTION 0xd673 /* breakpoint */ + +#ifdef CONFIG_KPROBES #include typedef u16 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xd673 /* breakpoint */ #define MAX_INSN_SIZE 2 #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ @@ -46,4 +50,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, #define flush_insn_slot(p) do { } while (0) +#endif /* CONFIG_KPROBES */ #endif /* __ASM_AVR32_KPROBES_H */ diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 2fb67b59d188..acb48e17d76b 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -47,3 +47,4 @@ generic-y += unaligned.h generic-y += user.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 64465e7e2245..47383d1545fc 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -62,3 +62,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 0568dbe224b8..5e320f660c3c 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -46,3 +46,4 @@ generic-y += types.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 1fa084cf1a43..f0ff107744ad 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += mm-arch-hooks.h generic-y += preempt.h generic-y += trace_clock.h generic-y += word-at-a-time.h +generic-y += kprobes.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 373cb23301e3..9280f110013d 100644 --- a/arch/h8300/include/asm/Kbuild +++ 
b/arch/h8300/include/asm/Kbuild @@ -75,3 +75,4 @@ generic-y += unaligned.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index db8ddabc6bd2..08d5e7ce820d 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -60,3 +60,4 @@ generic-y += unaligned.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/ia64/include/asm/kprobes.h b/arch/ia64/include/asm/kprobes.h index d5505d6f2382..0302b3664789 100644 --- a/arch/ia64/include/asm/kprobes.h +++ b/arch/ia64/include/asm/kprobes.h @@ -23,14 +23,19 @@ * 2005-Apr Rusty Lynch and Anil S Keshavamurthy * adapted from i386 */ +#include +#include + +#define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) + +#ifdef CONFIG_KPROBES + #include #include #include -#include #define __ARCH_WANT_KPROBES_INSN_SLOT #define MAX_INSN_SIZE 2 /* last half is for kprobe-booster */ -#define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) #define NOP_M_INST (long)(1<<27) #define BRL_INST(i1, i2) ((long)((0xcL << 37) | /* brl */ \ (0x1L << 12) | /* many */ \ @@ -124,4 +129,5 @@ extern void invalidate_stacked_regs(void); extern void flush_register_stack(void); extern void arch_remove_kprobe(struct kprobe *p); -#endif /* _ASM_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _ASM_KPROBES_H */ diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 6e69d2c42eee..30ee92ff0244 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -12,3 +12,4 @@ generic-y += preempt.h generic-y += sections.h generic-y += trace_clock.h generic-y += word-at-a-time.h +generic-y += kprobes.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 1f2e5d31cb24..f2deebe70ce9 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -34,3 +34,4 @@ generic-y += trace_clock.h generic-y += types.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index 167150c701d1..f254f925b78f 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild @@ -55,3 +55,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index b0ae88c9fed9..9e2101ddae20 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += preempt.h generic-y += syscalls.h generic-y += trace_clock.h generic-y += word-at-a-time.h +generic-y += kprobes.h diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h index daba1f9a4f79..291846d9ba83 100644 --- a/arch/mips/include/asm/kprobes.h +++ b/arch/mips/include/asm/kprobes.h @@ -22,6 +22,9 @@ #ifndef _ASM_KPROBES_H #define _ASM_KPROBES_H +#include + +#ifdef CONFIG_KPROBES #include #include @@ -94,4 +97,5 @@ struct kprobe_ctlblk { extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -#endif /* _ASM_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _ASM_KPROBES_H */ diff --git a/arch/mn10300/include/asm/kprobes.h b/arch/mn10300/include/asm/kprobes.h index c800b590183a..7abea0bdb549 100644 --- a/arch/mn10300/include/asm/kprobes.h +++ b/arch/mn10300/include/asm/kprobes.h @@ -21,13 +21,17 @@ #ifndef _ASM_KPROBES_H #define 
_ASM_KPROBES_H +#include + +#define BREAKPOINT_INSTRUCTION 0xff + +#ifdef CONFIG_KPROBES #include #include struct kprobe; typedef unsigned char kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xff #define MAX_INSN_SIZE 8 #define MAX_STACK_SIZE 128 @@ -47,4 +51,5 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern void arch_remove_kprobe(struct kprobe *p); +#endif /* CONFIG_KPROBES */ #endif /* _ASM_KPROBES_H */ diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index d63330e88379..fc31395dfcd9 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -63,3 +63,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 2832f031fb11..c77eadec7e14 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -71,3 +71,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index a07448dc2b29..7ac070267672 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -29,3 +29,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 97b8c1f83453..c7ee3fcbd6e2 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -1,5 +1,8 @@ #ifndef _ASM_POWERPC_KPROBES_H #define _ASM_POWERPC_KPROBES_H + +#include + #ifdef __KERNEL__ /* * Kernel Probes (KProbes) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 591e5a5279b0..84c0f9086483 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -27,6 +27,11 @@ * 2005-Dec Used as a template for s390 by Mike Grundy * */ +#include + +#define BREAKPOINT_INSTRUCTION 0x0002 + +#ifdef CONFIG_KPROBES #include #include #include @@ -37,7 +42,6 @@ struct pt_regs; struct kprobe; typedef u16 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0x0002 /* Maximum instruction size is 3 (16bit) halfwords: */ #define MAX_INSN_SIZE 0x0003 @@ -91,4 +95,5 @@ int probe_is_insn_relative_long(u16 *insn); #define flush_insn_slot(p) do { } while (0) +#endif /* CONFIG_KPROBES */ #endif /* _ASM_S390_KPROBES_H */ diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index a68eb2ec98a1..620970f837bc 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -15,3 +15,4 @@ generic-y += trace_clock.h generic-y += xor.h generic-y += serial.h generic-y += word-at-a-time.h +generic-y += kprobes.h diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h index 134f3980e44a..f0986f9b3844 100644 --- a/arch/sh/include/asm/kprobes.h +++ b/arch/sh/include/asm/kprobes.h @@ -1,13 +1,16 @@ #ifndef __ASM_SH_KPROBES_H #define __ASM_SH_KPROBES_H +#include + +#define BREAKPOINT_INSTRUCTION 0xc33a + #ifdef CONFIG_KPROBES #include #include typedef insn_size_t kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xc33a #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 diff --git a/arch/sparc/include/asm/kprobes.h b/arch/sparc/include/asm/kprobes.h index a145d798e112..49f8402035d7 100644 --- a/arch/sparc/include/asm/kprobes.h +++ b/arch/sparc/include/asm/kprobes.h @@ -1,13 +1,17 @@ #ifndef _SPARC64_KPROBES_H #define 
_SPARC64_KPROBES_H +#include + +#define BREAKPOINT_INSTRUCTION 0x91d02070 /* ta 0x70 */ +#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */ + +#ifdef CONFIG_KPROBES #include #include typedef u32 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0x91d02070 /* ta 0x70 */ -#define BREAKPOINT_INSTRUCTION_2 0x91d02071 /* ta 0x71 */ #define MAX_INSN_SIZE 2 #define kretprobe_blacklist_size 0 @@ -48,4 +52,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, int kprobe_fault_handler(struct pt_regs *regs, int trapnr); asmlinkage void __kprobes kprobe_trap(unsigned long trap_level, struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ #endif /* _SPARC64_KPROBES_H */ diff --git a/arch/tile/include/asm/kprobes.h b/arch/tile/include/asm/kprobes.h index d8f9a83943b1..4a8b1cadca24 100644 --- a/arch/tile/include/asm/kprobes.h +++ b/arch/tile/include/asm/kprobes.h @@ -17,10 +17,13 @@ #ifndef _ASM_TILE_KPROBES_H #define _ASM_TILE_KPROBES_H +#include + +#ifdef CONFIG_KPROBES + #include #include #include - #include #define __ARCH_WANT_KPROBES_INSN_SLOT @@ -76,4 +79,5 @@ void arch_remove_kprobe(struct kprobe *); extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); +#endif /* CONFIG_KPROBES */ #endif /* _ASM_TILE_KPROBES_H */ diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 052f7f6d0551..0578e09d7f5a 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -26,3 +26,4 @@ generic-y += topology.h generic-y += trace_clock.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 256c45b3ae34..d81fa51a2bf9 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -64,3 +64,4 @@ generic-y += user.h generic-y += vga.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d1d1e5094c28..200581691c6e 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -21,6 +21,12 @@ * * See arch/x86/kernel/kprobes.c for x86 kprobes history. */ + +#include + +#define BREAKPOINT_INSTRUCTION 0xcc + +#ifdef CONFIG_KPROBES #include #include #include @@ -32,7 +38,6 @@ struct pt_regs; struct kprobe; typedef u8 kprobe_opcode_t; -#define BREAKPOINT_INSTRUCTION 0xcc #define RELATIVEJUMP_OPCODE 0xe9 #define RELATIVEJUMP_SIZE 5 #define RELATIVECALL_OPCODE 0xe8 @@ -116,4 +121,6 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); extern int kprobe_debug_handler(struct pt_regs *regs); + +#endif /* CONFIG_KPROBES */ #endif /* _ASM_X86_KPROBES_H */ diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b7fbaa56b51a..1d1f610a138c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -32,3 +32,4 @@ generic-y += topology.h generic-y += trace_clock.h generic-y += word-at-a-time.h generic-y += xor.h +generic-y += kprobes.h diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h new file mode 100644 index 000000000000..57af9f21d148 --- /dev/null +++ b/include/asm-generic/kprobes.h @@ -0,0 +1,25 @@ +#ifndef _ASM_GENERIC_KPROBES_H +#define _ASM_GENERIC_KPROBES_H + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_KPROBES +/* + * Blacklist ganerating macro. 
Specify functions which is not probed + * by using this macro. + */ +# define __NOKPROBE_SYMBOL(fname) \ +static unsigned long __used \ + __attribute__((__section__("_kprobe_blacklist"))) \ + _kbl_addr_##fname = (unsigned long)fname; +# define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) +/* Use this to forbid a kprobes attach on very low level functions */ +# define __kprobes __attribute__((__section__(".kprobes.text"))) +# define nokprobe_inline __always_inline +#else +# define NOKPROBE_SYMBOL(fname) +# define __kprobes +# define nokprobe_inline inline +#endif +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ + +#endif /* _ASM_GENERIC_KPROBES_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index cf0fa5d86059..dca49b4099da 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -566,12 +566,4 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s (_________p1); \ }) -/* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */ -#ifdef CONFIG_KPROBES -# define __kprobes __attribute__((__section__(".kprobes.text"))) -# define nokprobe_inline __always_inline -#else -# define __kprobes -# define nokprobe_inline inline -#endif #endif /* __LINUX_COMPILER_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8f6849084248..985ed79ef431 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -29,7 +29,7 @@ * and Prasanna S Panchamukhi * added function-return probes. */ -#include /* for __kprobes */ +#include #include #include #include @@ -40,9 +40,9 @@ #include #include #include +#include #ifdef CONFIG_KPROBES -#include /* kprobe_status settings */ #define KPROBE_HIT_ACTIVE 0x00000001 @@ -51,6 +51,7 @@ #define KPROBE_HIT_SSDONE 0x00000008 #else /* CONFIG_KPROBES */ +#include typedef int kprobe_opcode_t; struct arch_specific_insn { int dummy; @@ -481,18 +482,4 @@ static inline int enable_jprobe(struct jprobe *jp) return enable_kprobe(&jp->kp); } -#ifdef CONFIG_KPROBES -/* - * Blacklist ganerating macro. Specify functions which is not probed - * by using this macro. - */ -#define __NOKPROBE_SYMBOL(fname) \ -static unsigned long __used \ - __attribute__((section("_kprobe_blacklist"))) \ - _kbl_addr_##fname = (unsigned long)fname; -#define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) -#else -#define NOKPROBE_SYMBOL(fname) -#endif - #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3 From 5763eb2eeb9f33689f5c1a7725fa74e08369b570 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:03 +1100 Subject: autofs: remove wrong comment This format seems to have been taken from device mapper header, but autofs has no such file:function in both kernel and userspace. 
Link: http://lkml.kernel.org/r/148577164094.9801.4775075118014742496.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- include/uapi/linux/auto_dev-ioctl.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 021ed331dd71..388739b429b0 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -120,10 +120,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) in->ioctlfd = -1; } -/* - * If you change this make sure you make the corresponding change - * to autofs-dev-ioctl.c:lookup_ioctl() - */ enum { /* Get various version info */ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, -- cgit v1.2.3 From 537e165ff26cca75cff9de2f9b9d6cc4a4cece5b Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:08 +1100 Subject: autofs: add command enum/macros for root-dir ioctls Sync root-dir ioctl with misc-char-dev ioctl's enum/macro format since these two types of ioctls aren't completely independent of each other in terms of command nr. No functional changes. Link: http://lkml.kernel.org/r/148577166143.9801.15511796506678428145.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- include/uapi/linux/auto_dev-ioctl.h | 2 -- include/uapi/linux/auto_fs.h | 25 ++++++++++++++++++------- include/uapi/linux/auto_fs4.h | 16 +++++++++++----- 3 files changed, 29 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 388739b429b0..af107aa7b685 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -156,8 +156,6 @@ enum { AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, }; -#define AUTOFS_IOCTL 0x93 - #define AUTOFS_DEV_IOCTL_VERSION \ _IOWR(AUTOFS_IOCTL, \ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 1bfc3ed8b284..aa63451ef20a 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -61,12 +61,23 @@ struct autofs_packet_expire { char name[NAME_MAX+1]; }; -#define AUTOFS_IOC_READY _IO(0x93, 0x60) -#define AUTOFS_IOC_FAIL _IO(0x93, 0x61) -#define AUTOFS_IOC_CATATONIC _IO(0x93, 0x62) -#define AUTOFS_IOC_PROTOVER _IOR(0x93, 0x63, int) -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t) -#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long) -#define AUTOFS_IOC_EXPIRE _IOR(0x93, 0x65, struct autofs_packet_expire) +#define AUTOFS_IOCTL 0x93 + +enum { + AUTOFS_IOC_READY_CMD = 0x60, + AUTOFS_IOC_FAIL_CMD, + AUTOFS_IOC_CATATONIC_CMD, + AUTOFS_IOC_PROTOVER_CMD, + AUTOFS_IOC_SETTIMEOUT_CMD, + AUTOFS_IOC_EXPIRE_CMD, +}; + +#define AUTOFS_IOC_READY _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD) +#define AUTOFS_IOC_FAIL _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD) +#define AUTOFS_IOC_CATATONIC _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD) +#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOVER_CMD, int) +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, compat_ulong_t) +#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, unsigned long) +#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_CMD, struct autofs_packet_expire) #endif /* _UAPI_LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 
8f8f1bdcca8c..7c6da423d54e 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -148,10 +148,16 @@ union autofs_v5_packet_union { autofs_packet_expire_direct_t expire_direct; }; -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93, 0x66, int) -#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_PROTOSUBVER _IOR(0x93, 0x67, int) -#define AUTOFS_IOC_ASKUMOUNT _IOR(0x93, 0x70, int) +enum { + AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ + AUTOFS_IOC_PROTOSUBVER_CMD, + AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ +}; + +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) +#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) #endif /* _LINUX_AUTO_FS4_H */ -- cgit v1.2.3 From 7bd153cf787caeff177313de4d5d388c5a83ed99 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:09 +1100 Subject: autofs: remove duplicated AUTOFS_DEV_IOCTL_SIZE definition This macro is already defined in the uapi header. Also use this macro where possible. Link: http://lkml.kernel.org/r/148577166656.9801.10322423666945951186.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- fs/autofs4/dev-ioctl.c | 2 -- include/uapi/linux/auto_dev-ioctl.h | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 6f48d670c941..806df746f1a9 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -38,8 +38,6 @@ * which have been left busy at at service shutdown. */ -#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) - typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index af107aa7b685..744b3d060968 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -113,10 +113,10 @@ struct autofs_dev_ioctl { static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { - memset(in, 0, sizeof(struct autofs_dev_ioctl)); + memset(in, 0, AUTOFS_DEV_IOCTL_SIZE); in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; - in->size = sizeof(struct autofs_dev_ioctl); + in->size = AUTOFS_DEV_IOCTL_SIZE; in->ioctlfd = -1; } -- cgit v1.2.3 From 80db060f313a344f2abadab84cad7647237d902e Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Wed, 8 Feb 2017 10:28:14 +1100 Subject: sigaltstack: support SS_AUTODISARM for CONFIG_COMPAT Currently SS_AUTODISARM is not supported in compatibility mode, but the call does not return -EINVAL either. This causes dosemu built with -m32 on x86_64 to crash. The kernel's sigaltstack selftest also fails if compiled with -m32. This patch adds the needed support.
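As a rough illustration of what this enables for a 32-bit process, here is an editor's user-space sketch (not part of the patch; the program structure is invented for the example, and the SS_AUTODISARM fallback define mirrors the uapi value):

  /*
   * Hedged sketch: install an alternate signal stack with SS_AUTODISARM and
   * observe that it is disarmed (SS_DISABLE) while the handler runs.
   * Building with -m32 on x86_64 exercises the compat path fixed here.
   */
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  #ifndef SS_AUTODISARM                     /* older libc headers may lack it */
  #define SS_AUTODISARM (1U << 31)          /* value from include/uapi/linux/signal.h */
  #endif

  static void handler(int sig)
  {
          stack_t cur;

          sigaltstack(NULL, &cur);
          printf("in handler: ss_flags=%#x SS_DISABLE=%d\n",
                 (unsigned int)cur.ss_flags, !!(cur.ss_flags & SS_DISABLE));
  }

  int main(void)
  {
          stack_t ss;
          struct sigaction sa;

          ss.ss_sp = malloc(SIGSTKSZ);
          ss.ss_size = SIGSTKSZ;
          ss.ss_flags = SS_AUTODISARM;
          if (sigaltstack(&ss, NULL))       /* EINVAL here: kernel lacks SS_AUTODISARM */
                  perror("sigaltstack");

          memset(&sa, 0, sizeof(sa));
          sa.sa_handler = handler;
          sa.sa_flags = SA_ONSTACK;
          sigaction(SIGUSR1, &sa, NULL);

          raise(SIGUSR1);
          return 0;
  }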
Link: http://lkml.kernel.org/r/20170205101213.8163-2-stsp@list.ru Signed-off-by: Stas Sergeev Cc: Milosz Tanski Cc: Andy Lutomirski Cc: Al Viro Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Nicolas Pitre Cc: Waiman Long Cc: Dave Hansen Cc: Dmitry Safonov Cc: Wang Xiaoqiang Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton --- include/linux/compat.h | 4 +++- kernel/signal.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 63609398ef9f..d8535a430caf 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -711,8 +711,10 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long); compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex(ptr_to_compat((void __user *)t->sas_ss_sp), &__uss->ss_sp); \ - put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \ + put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + if (t->sas_ss_flags & SS_AUTODISARM) \ + sas_ss_reset(t); \ } while (0); asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, diff --git a/kernel/signal.c b/kernel/signal.c index 3603d93a1968..0f99304d39fd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3239,10 +3239,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss) int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { + int err; struct task_struct *t = current; - return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), + &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #endif -- cgit v1.2.3 From 9d1c362d9f21ed70425d28e4db964bdea492291f Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 8 Feb 2017 10:28:15 +1100 Subject: kdump, vmcoreinfo: report actual value of phys_base Currently, the VMCOREINFO note reports the virtual address of the phys_base symbol. This doesn't help: to translate that virtual address into something that can be read from the dump, one already needs the very value of phys_base that is being looked for. Userland kdump tools such as makedumpfile and the crash utility have so far gone to some effort to calculate phys_base for crash dump formats generated by mechanisms running outside the Linux kernel, for example hypervisor dumps (qemu dumps, which ordinary users trigger via virsh dump) or dumps implemented in vendor-specific firmware. That is, they find a piece of kernel data whose virtual and physical addresses are both available via the note information and derive phys_base from it. However, that data was never meant to carry phys_base, and there is no guarantee that other crash dump mechanisms include information from which phys_base can be derived in the same way. Since VMCOREINFO consists of simple strings, it is easy to extract from a vmcore with strings and grep: $ strings vmcore-3.10.0-121.el7.x86_64 | grep -E ".*VMCOREINFO.*" -A 100 VMCOREINFO OSRELEASE=3.10.0-121.el7.x86_64 PAGESIZE=4096 ... This is also useful for getting the value of phys_base of a kdump 2nd kernel contained in a vmcore produced by one of the external crash dump mechanisms mentioned above; the kdump 2nd kernel is an inherently relocated kernel.
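With the new entry in place, a dump-analysis tool can read phys_base straight out of the note text instead of inferring it. A minimal sketch follows; the helper name and the line-by-line scan are the editor's assumptions, not makedumpfile or crash code:

  #include <stdio.h>
  #include <string.h>

  /*
   * Scan the "KEY=value" lines of a VMCOREINFO blob for the PHYS_BASE=
   * entry added by this patch. Returns 0 on success, -1 if the kernel is
   * too old to export it (the caller would fall back to the old heuristics).
   */
  static int vmcoreinfo_phys_base(const char *vmcoreinfo, unsigned long *phys_base)
  {
          const char *line = vmcoreinfo;

          while (line && *line) {
                  if (sscanf(line, "PHYS_BASE=%lx", phys_base) == 1)
                          return 0;
                  line = strchr(line, '\n');        /* advance to the next KEY=value line */
                  if (line)
                          line++;
          }
          return -1;
  }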
This commit doesn't remove the VMCOREINFO_SYMBOL(phys_base) line, because makedumpfile refers to it and removing it would break old versions of makedumpfile. Signed-off-by: HATAYAMA Daisuke Cc: Eric W. Biederman Cc: Vivek Goyal Cc: Atsushi Kumagai Cc: Dave Anderson Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 1 + include/linux/kexec.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 307b1f4543de..2e3c34b1df37 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -338,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_PHYS_BASE(phys_base); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..e98e546b543c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -283,6 +283,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +#define VMCOREINFO_PHYS_BASE(value) \ + vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; -- cgit v1.2.3 From 591941542cb1c6b686247f34258f91b2f57f07cc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 8 Feb 2017 10:28:16 +1100 Subject: include/linux/pid.h: use for_each_thread() in do_each_pid_thread() while_each_pid_thread() uses while_each_thread(), which is unsafe under RCU lock according to commit 0c740d0afc3bff0a ("introduce for_each_thread() to replace the buggy while_each_thread()"). Use for_each_thread() in do_each_pid_thread(), which is safe under RCU lock. Link: http://lkml.kernel.org/r/201702011947.DBD56740.OMVHOLOtSJFFFQ@I-love.SAKURA.ne.jp Link: http://lkml.kernel.org/r/1486041779-4401-2-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- include/linux/pid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 23705a53abba..298ead5512e5 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -191,10 +191,10 @@ pid_t pid_vnr(struct pid *pid); #define do_each_pid_thread(pid, type, task) \ do_each_pid_task(pid, type, task) { \ struct task_struct *tg___ = task; \ - do { + for_each_thread(tg___, task) { #define while_each_pid_thread(pid, type, task) \ - } while_each_thread(tg___, task); \ + } \ task = tg___; \ } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */ -- cgit v1.2.3 From aed672c63c2edc0004d8c1d14794d30f8b9f938c Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 8 Feb 2017 10:28:23 +1100 Subject: ipc/sem: add hysteresis sysv sem has two lock modes: one with per-semaphore locks, and one with a single global lock for the whole array. When switching from the per-semaphore locks to the global lock, all per-semaphore locks must be scanned for ongoing operations. The patch adds a hysteresis for switching from the global lock to the per-semaphore locks. This reduces how often the per-semaphore locks must be scanned. Compared to the initial patch, this is a simplified solution: Setting USE_GLOBAL_LOCK_HYSTERESIS to 1 restores the current behavior.
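To make the hysteresis concrete, here is a simplified user-space model of the counter (an editor's illustration only; the real logic lives in complexmode_enter(), complexmode_tryleave() and sem_lock() in the diff below):

  /*
   * Model: a complex op arms a countdown of USE_GLOBAL_LOCK_HYSTERESIS.
   * Subsequent simple ops still go through the global lock and count it
   * down, and only when the counter reaches 0 do simple ops switch back
   * to per-semaphore locks.
   */
  #define USE_GLOBAL_LOCK_HYSTERESIS 10

  static unsigned int use_global_lock;      /* >0: global lock required */

  static void model_complex_op(void)
  {
          use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS;
  }

  static const char *model_simple_op(void)
  {
          if (use_global_lock == 0)
                  return "per-semaphore lock";
          use_global_lock--;                /* hysteresis: stay global a bit longer */
          return "global lock";
  }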
In theory, a workload with exactly 10 simple sops and then one complex op now scales a bit worse, but this is pure theory: If there is concurrency, the it won't be exactly 10:1:10:1:10:1:... If there is no concurrency, then there is no need for scalability. Link: http://lkml.kernel.org/r/1476851896-3590-3-git-send-email-manfred@colorfullife.com Signed-off-by: Manfred Spraul Cc: Peter Zijlstra Cc: Davidlohr Bueso Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: <1vier1@web.de> Cc: kernel test robot Cc: Signed-off-by: Andrew Morton --- include/linux/sem.h | 2 +- ipc/sem.c | 86 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 62 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/sem.h b/include/linux/sem.h index d0efd6e6c20a..4fc222f8755d 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,7 +21,7 @@ struct sem_array { struct list_head list_id; /* undo requests on this array */ int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ - bool complex_mode; /* no parallel simple ops */ + unsigned int use_global_lock;/* >0: global lock required */ }; #ifdef CONFIG_SYSVIPC diff --git a/ipc/sem.c b/ipc/sem.c index fe5db1ed081b..e468cd1c12f0 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -158,23 +158,43 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define SEMMSL_FAST 256 /* 512 bytes on stack */ #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */ +/* + * Switching from the mode suitable for simple ops + * to the mode for complex ops is costly. Therefore: + * use some hysteresis + */ +#define USE_GLOBAL_LOCK_HYSTERESIS 10 + /* * Locking: * a) global sem_lock() for read/write * sem_undo.id_next, * sem_array.complex_count, - * sem_array.complex_mode * sem_array.pending{_alter,_const}, * sem_array.sem_undo * * b) global or semaphore sem_lock() for read/write: * sem_array.sem_base[i].pending_{const,alter}: - * sem_array.complex_mode (for read) * * c) special: * sem_undo_list.list_proc: * * undo_list->lock for write * * rcu for read + * use_global_lock: + * * global sem_lock() for write + * * either local or global sem_lock() for read. + * + * Memory ordering: + * Most ordering is enforced by using spin_lock() and spin_unlock(). + * The special case is use_global_lock: + * Setting it from non-zero to 0 is a RELEASE, this is ensured by + * using smp_store_release(). + * Testing if it is non-zero is an ACQUIRE, this is ensured by using + * smp_load_acquire(). + * Setting it from 0 to non-zero must be ordered with regards to + * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() + * is inside a spin_lock() and after a write from 0 to non-zero a + * spin_lock()+spin_unlock() is done. */ #define sc_semmsl sem_ctls[0] @@ -273,12 +293,16 @@ static void complexmode_enter(struct sem_array *sma) int i; struct sem *sem; - if (sma->complex_mode) { - /* We are already in complex_mode. Nothing to do */ + if (sma->use_global_lock > 0) { + /* + * We are already in global lock mode. + * Nothing to do, just reset the + * counter until we return to simple mode. + */ + sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; return; } - - sma->complex_mode = true; + sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; for (i = 0; i < sma->sem_nsems; i++) { sem = sma->sem_base + i; @@ -299,13 +323,17 @@ static void complexmode_tryleave(struct sem_array *sma) */ return; } - /* - * Immediately after setting complex_mode to false, - * a simple op can start. 
Thus: all memory writes - * performed by the current operation must be visible - * before we set complex_mode to false. - */ - smp_store_release(&sma->complex_mode, false); + if (sma->use_global_lock == 1) { + /* + * Immediately after setting use_global_lock to 0, + * a simple op can start. Thus: all memory writes + * performed by the current operation must be visible + * before we set use_global_lock to 0. + */ + smp_store_release(&sma->use_global_lock, 0); + } else { + sma->use_global_lock--; + } } #define SEM_GLOBAL_LOCK (-1) @@ -335,22 +363,23 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * Optimized locking is possible if no complex operation * is either enqueued or processed right now. * - * Both facts are tracked by complex_mode. + * Both facts are tracked by use_global_mode. */ sem = sma->sem_base + sops->sem_num; /* - * Initial check for complex_mode. Just an optimization, + * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ - if (!sma->complex_mode) { + if (!sma->use_global_lock) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); - if (!smp_load_acquire(&sma->complex_mode)) { + /* pairs with smp_store_release() */ + if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; } @@ -360,19 +389,26 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, /* slow path: acquire the full lock */ ipc_lock_object(&sma->sem_perm); - if (sma->complex_count == 0) { - /* False alarm: - * There is no complex operation, thus we can switch - * back to the fast path. + if (sma->use_global_lock == 0) { + /* + * The use_global_lock mode ended while we waited for + * sma->sem_perm.lock. Thus we must switch to locking + * with sem->lock. + * Unlike in the fast path, there is no need to recheck + * sma->use_global_lock after we have acquired sem->lock: + * We own sma->sem_perm.lock, thus use_global_lock cannot + * change. */ spin_lock(&sem->lock); + ipc_unlock_object(&sma->sem_perm); return sops->sem_num; } else { - /* Not a false alarm, thus complete the sequence for a - * full lock. + /* + * Not a false alarm, thus continue to use the global lock + * mode. No need for complexmode_enter(), this was done by + * the caller that has set use_global_mode to non-zero. */ - complexmode_enter(sma); return SEM_GLOBAL_LOCK; } } @@ -476,7 +512,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) } sma->complex_count = 0; - sma->complex_mode = true; /* dropped by sem_unlock below */ + sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; INIT_LIST_HEAD(&sma->pending_alter); INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); -- cgit v1.2.3
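The memory-ordering rules described in the locking comment of the ipc/sem patch above can be restated with C11 atomics; this is an editor's user-space analogue of the acquire/release pairing, not kernel code:

  #include <stdatomic.h>

  /* The hysteresis counter from the patch, modelled as a C11 atomic. */
  static _Atomic unsigned int use_global_lock;

  /*
   * Analogue of complexmode_tryleave() once the counter hits zero: the
   * RELEASE store publishes every write done while in global-lock mode.
   */
  static void leave_global_mode(void)
  {
          atomic_store_explicit(&use_global_lock, 0, memory_order_release);
  }

  /*
   * Analogue of the sem_lock() fast path (called with the per-semaphore
   * lock already held): the ACQUIRE load pairs with the release store
   * above, so seeing 0 here guarantees the global-mode writes are visible.
   */
  static int simple_op_may_use_per_sem_lock(void)
  {
          return atomic_load_explicit(&use_global_lock, memory_order_acquire) == 0;
  }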