From aad30c8ca049ac322ac180f123921f9c52b064d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 10:26:01 +1100 Subject: slub: make sysfs directories for memcg sub-caches optional SLUB creates a per-cache directory under /sys/kernel/slab which hosts a bunch of debug files. Usually, there aren't that many caches on a system and this doesn't really matter; however, if memcg is in use, each cache can have per-cgroup sub-caches. SLUB creates the same directories for these sub-caches under /sys/kernel/slab/$CACHE/cgroup. Unfortunately, because there can be a lot of cgroups, active or draining, the product of the numbers of caches, cgroups and files in each directory can reach a very high number - hundreds of thousands is commonplace. Millions and beyond aren't difficult to reach either. What's under /sys/kernel/slab is primarily for debugging and the information and control on the a root cache already cover its sub-caches. While having a separate directory for each sub-cache can be helpful for development, it doesn't make much sense to pay this amount of overhead by default. This patch introduces a boot parameter slub_memcg_sysfs which determines whether to create sysfs directories for per-memcg sub-caches. It also adds CONFIG_SLUB_MEMCG_SYSFS_ON which determines the boot parameter's default value and defaults to 0. Link: http://lkml.kernel.org/r/20170204145203.GB26958@mtj.duckdns.org Signed-off-by: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index be7c0d9506b1..44f7f76d7dbb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3679,6 +3679,14 @@ last alloc / free. For more information see Documentation/vm/slub.txt. + slub_memcg_sysfs= [MM, SLUB] + Determines whether to enable sysfs directories for + memory cgroup sub-caches. 1 to enable, 0 to disable. + The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON. + Enabling this can lead to a very high number of debug + directories and files being created under + /sys/kernel/slub. + slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory -- cgit v1.2.3 From fef7924f4c7bcf4a9b3069ba309580fbebe26746 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 8 Feb 2017 10:26:28 +1100 Subject: trace-vmscan-postprocess: sync with tracepoints updates Both mm_vmscan_lru_shrink_active and mm_vmscan_lru_isolate have changed so the script needs to be update to reflect those changes Link: http://lkml.kernel.org/r/20170105151737.GU21618@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- .../trace/postprocess/trace-vmscan-postprocess.pl | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 8f961ef2b457..ba976805853a 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -112,8 +112,8 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -205,15 +205,15 @@ $regex_wakeup_kswapd = generate_traceevent_regex( $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, - "isolate_mode", "order", - "nr_requested", "nr_scanned", "nr_taken", - "file"); + "isolate_mode", "classzone_idx", "order", + "nr_requested", "nr_scanned", "nr_skipped", "nr_taken", + "lru"); $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, - "nid", "zid", - "nr_scanned", "nr_reclaimed", "priority", - "flags"); + "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", + "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, @@ -381,8 +381,8 @@ EVENT_PROCESS: next; } my $isolate_mode = $1; - my $nr_scanned = $4; - my $file = $6; + my $nr_scanned = $5; + my $file = $8; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -391,7 +391,7 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; - if ($file == 1) { + if ($file =~ /_file/) { $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; } else { $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; @@ -406,8 +406,8 @@ EVENT_PROCESS: next; } - my $nr_reclaimed = $4; - my $flags = $6; + my $nr_reclaimed = $3; + my $flags = $12; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; -- cgit v1.2.3 From 2b53b238a697f65c3a311baf0c3bd7503be44b73 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 8 Feb 2017 10:26:40 +1100 Subject: mm, thp: add new defer+madvise defrag option There is no thp defrag option that currently allows MADV_HUGEPAGE regions to do direct compaction and reclaim while all other thp allocations simply trigger kswapd and kcompactd in the background and fail immediately. The "defer" setting simply triggers background reclaim and compaction for all regions, regardless of MADV_HUGEPAGE, which makes it unusable for our userspace where MADV_HUGEPAGE is being used to indicate the application is willing to wait for work for thp memory to be available. The "madvise" setting will do direct compaction and reclaim for these MADV_HUGEPAGE regions, but does not trigger kswapd and kcompactd in the background for anybody else. For reasonable usage, there needs to be a mesh between the two options. This patch introduces a fifth mode, "defer+madvise", that will do direct reclaim and compaction for MADV_HUGEPAGE regions and trigger background reclaim and compaction for everybody else so that hugepages may be available in the near future. A proposal to allow direct reclaim and compaction for MADV_HUGEPAGE regions as part of the "defer" mode, making it a very powerful setting and avoids breaking userspace, was offered: http://marc.info/?t=148236612700003. This additional mode is a compromise. A second proposal to allow both "defer" and "madvise" to be selected at the same time was also offered: http://marc.info/?t=148357345300001. This is possible, but there was a concern that it might break existing userspaces the parse the output of the defrag mode, so the fifth option was introduced instead. This patch also cleans up the helper function for storing to "enabled" and "defrag" since the former supports three modes while the latter supports five and triple_flag_store() was getting unnecessarily messy. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701101614330.41805@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jonathan Corbet Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- Documentation/vm/transhuge.txt | 8 ++- include/linux/huge_mm.h | 1 + mm/huge_memory.c | 146 +++++++++++++++++++++-------------------- 3 files changed, 82 insertions(+), 73 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index c4171e4519c2..8fda5b7b24e9 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -110,6 +110,7 @@ MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag echo defer >/sys/kernel/mm/transparent_hugepage/defrag +echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag @@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start to utilise them. "defer" means that an application will wake kswapd in the background -to reclaim pages and wake kcompact to compact memory so that THP is +to reclaim pages and wake kcompactd to compact memory so that THP is available in the near future. It's the responsibility of khugepaged to then install the THP pages later. +"defer+madvise" will enter direct reclaim and compaction like "always", but +only for regions that have used madvise(MADV_HUGEPAGE); all other regions +will wake kswapd in the background to reclaim pages and wake kcompactd to +compact memory so that THP is available in the near future. + "madvise" will enter direct reclaim like "always" but only for regions that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 97e478d6b690..f0029e786205 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -33,6 +33,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f3ad65c85de..f9ecc2aeadfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = { }; #ifdef CONFIG_SYSFS - -static ssize_t triple_flag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag deferred, - enum transparent_hugepage_flag req_madv) -{ - if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { - if (enabled == deferred) - return -EINVAL; - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(deferred, &transparent_hugepage_flags); - } else if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { - clear_bit(deferred, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - set_bit(enabled, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - set_bit(req_madv, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { - clear_bit(enabled, &transparent_hugepage_flags); - clear_bit(req_madv, &transparent_hugepage_flags); - clear_bit(deferred, &transparent_hugepage_flags); - } else - return -EINVAL; - - return count; -} - static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - ssize_t ret; + ssize_t ret = count; - ret = triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } - return ret; } static struct kobj_attribute enabled_attr = @@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, return count; } -/* - * Currently defrag only disables __GFP_NOWAIT for allocation. A blind - * __GFP_REPEAT is too aggressive, it's never worth swapping tons of - * memory just to allocate one more hugepage. - */ static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer madvise never\n"); + return sprintf(buf, "[always] defer defer+madvise madvise never\n"); if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [madvise] never\n"); - else - return sprintf(buf, "always defer madvise [never]\n"); - + return sprintf(buf, "always [defer] defer+madvise madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [defer+madvise] madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer defer+madvise [madvise] never\n"); + return sprintf(buf, "always defer defer+madvise madvise [never]\n"); } + static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return triple_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("defer+madvise", buf, + min(sizeof("defer+madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; } static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); @@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } /* - * If THP defrag is set to always then directly reclaim/compact as necessary - * If set to defer then do only background reclaim/compact and defer to khugepaged - * If set to madvise and the VMA is flagged then directly reclaim/compact - * When direct reclaim/compact is allowed, don't retry except for flagged VMA's + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation */ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); + const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, - &transparent_hugepage_flags) && vma_madvised) - return GFP_TRANSHUGE; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, - &transparent_hugepage_flags)) - return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; - else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, - &transparent_hugepage_flags)) + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); - + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : + 0); return GFP_TRANSHUGE_LIGHT; } -- cgit v1.2.3 From d256181021e1df9d24823e9eae10e1b2a9900cff Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 8 Feb 2017 10:26:57 +1100 Subject: zram: remove obsolete sysfs attrs We had a deprecated_attr_warn() warning for 2 years and now the time has come and we finally can do the cleanup. The plan was as follows: : per-stat sysfs attributes are considered to be deprecated. : The basic strategy is: : -- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) : -- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) : : The list of deprecated attributes can be found here: : Documentation/ABI/obsolete/sysfs-block-zram : : Basically, every attribute that has its own read accessible sysfs : node (e.g. num_reads) *AND* is accessible via one of the stat files : (zram/stat or zram/io_stat or zram/mm_stat) is considered : to be deprecated. The patch also removes `obsolete/sysfs-block-zram', clean ups `testing/sysfs-block-zram' and tweaks zram.txt files. Link: http://lkml.kernel.org/r/20170118035838.11090-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/ABI/obsolete/sysfs-block-zram | 119 ---------------------------- Documentation/ABI/testing/sysfs-block-zram | 101 ++--------------------- Documentation/blockdev/zram.txt | 74 ++++++++--------- drivers/block/zram/zram_drv.c | 101 +---------------------- 4 files changed, 42 insertions(+), 353 deletions(-) delete mode 100644 Documentation/ABI/obsolete/sysfs-block-zram (limited to 'Documentation') diff --git a/Documentation/ABI/obsolete/sysfs-block-zram b/Documentation/ABI/obsolete/sysfs-block-zram deleted file mode 100644 index 720ea92cfb2e..000000000000 --- a/Documentation/ABI/obsolete/sysfs-block-zram +++ /dev/null @@ -1,119 +0,0 @@ -What: /sys/block/zram/num_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - Now accessible via zram/stat node. - -What: /sys/block/zram/num_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - Now accessible via zram/stat node. - -What: /sys/block/zram/invalid_io -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_reads -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/failed_writes -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/notify_free -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - Now accessible via zram/io_stat node. - -What: /sys/block/zram/zero_pages -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/orig_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/compr_data_size -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/mem_used_total -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - Now accessible via zram/mm_stat node. - -What: /sys/block/zram/mem_used_max -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. - Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. - -What: /sys/block/zram/mem_limit -Date: August 2015 -Contact: Sergey Senozhatsky -Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. - The limit could be changed in run time and "0" means disable - the limit. No limit is the initial state. Unit: bytes - Downgraded to write-only node: so it's possible to set new - value only; its current value is stored in zram/mm_stat - node. diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 4518d30b8c2e..451b6d882b2c 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -22,41 +22,6 @@ Description: device. The reset operation frees all the memory associated with this device. -What: /sys/block/zram/num_reads -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_reads file is read-only and specifies the number of - reads (failed or successful) done on this device. - -What: /sys/block/zram/num_writes -Date: August 2010 -Contact: Nitin Gupta -Description: - The num_writes file is read-only and specifies the number of - writes (failed or successful) done on this device. - -What: /sys/block/zram/invalid_io -Date: August 2010 -Contact: Nitin Gupta -Description: - The invalid_io file is read-only and specifies the number of - non-page-size-aligned I/O requests issued to this device. - -What: /sys/block/zram/failed_reads -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_reads file is read-only and specifies the number of - failed reads happened on this device. - -What: /sys/block/zram/failed_writes -Date: February 2014 -Contact: Sergey Senozhatsky -Description: - The failed_writes file is read-only and specifies the number of - failed writes happened on this device. - What: /sys/block/zram/max_comp_streams Date: February 2014 Contact: Sergey Senozhatsky @@ -73,74 +38,24 @@ Description: available and selected compression algorithms, change compression algorithm selection. -What: /sys/block/zram/notify_free -Date: August 2010 -Contact: Nitin Gupta -Description: - The notify_free file is read-only. Depending on device usage - scenario it may account a) the number of pages freed because - of swap slot free notifications or b) the number of pages freed - because of REQ_DISCARD requests sent by bio. The former ones - are sent to a swap block device when a swap slot is freed, which - implies that this disk is being used as a swap disk. The latter - ones are sent by filesystem mounted with discard option, - whenever some data blocks are getting discarded. - -What: /sys/block/zram/zero_pages -Date: August 2010 -Contact: Nitin Gupta -Description: - The zero_pages file is read-only and specifies number of zero - filled pages written to this disk. No memory is allocated for - such pages. - -What: /sys/block/zram/orig_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The orig_data_size file is read-only and specifies uncompressed - size of data stored in this disk. This excludes zero-filled - pages (zero_pages) since no memory is allocated for them. - Unit: bytes - -What: /sys/block/zram/compr_data_size -Date: August 2010 -Contact: Nitin Gupta -Description: - The compr_data_size file is read-only and specifies compressed - size of data stored in this disk. So, compression ratio can be - calculated using orig_data_size and this statistic. - Unit: bytes - -What: /sys/block/zram/mem_used_total -Date: August 2010 -Contact: Nitin Gupta -Description: - The mem_used_total file is read-only and specifies the amount - of memory, including allocator fragmentation and metadata - overhead, allocated for this disk. So, allocator space - efficiency can be calculated using compr_data_size and this - statistic. - Unit: bytes - What: /sys/block/zram/mem_used_max Date: August 2014 Contact: Minchan Kim Description: - The mem_used_max file is read/write and specifies the amount - of maximum memory zram have consumed to store compressed data. - For resetting the value, you should write "0". Otherwise, - you could see -EINVAL. + The mem_used_max file is write-only and is used to reset + the counter of maximum memory zram have consumed to store + compressed data. For resetting the value, you should write + "0". Otherwise, you could see -EINVAL. Unit: bytes What: /sys/block/zram/mem_limit Date: August 2014 Contact: Minchan Kim Description: - The mem_limit file is read/write and specifies the maximum - amount of memory ZRAM can use to store the compressed data. The - limit could be changed in run time and "0" means disable the - limit. No limit is the initial state. Unit: bytes + The mem_limit file is write-only and specifies the maximum + amount of memory ZRAM can use to store the compressed data. + The limit could be changed in run time and "0" means disable + the limit. No limit is the initial state. Unit: bytes What: /sys/block/zram/compact Date: August 2015 diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 0535ae1f73e5..1c0c08d9206b 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -161,42 +161,14 @@ Name access description disksize RW show and set the device's disk size initstate RO shows the initialization state of the device reset WO trigger device reset -num_reads RO the number of reads -failed_reads RO the number of failed reads -num_write RO the number of writes -failed_writes RO the number of failed writes -invalid_io RO the number of non-page-size-aligned I/O requests +mem_used_max WO reset the `mem_used_max' counter (see later) +mem_limit WO specifies the maximum amount of memory ZRAM can use + to store the compressed data max_comp_streams RW the number of possible concurrent compress operations comp_algorithm RW show and change the compression algorithm -notify_free RO the number of notifications to free pages (either - slot free notifications or REQ_DISCARD requests) -zero_pages RO the number of zero filled pages written to this disk -orig_data_size RO uncompressed size of data stored in this disk -compr_data_size RO compressed size of data stored in this disk -mem_used_total RO the amount of memory allocated for this disk -mem_used_max RW the maximum amount of memory zram have consumed to - store the data (to reset this counter to the actual - current value, write 1 to this attribute) -mem_limit RW the maximum amount of memory ZRAM can use to store - the compressed data -pages_compacted RO the number of pages freed during compaction - (available only via zram/mm_stat node) compact WO trigger memory compaction debug_stat RO this file is used for zram debugging purposes -WARNING -======= -per-stat sysfs attributes are considered to be deprecated. -The basic strategy is: --- the existing RW nodes will be downgraded to WO nodes (in linux 4.11) --- deprecated RO sysfs nodes will eventually be removed (in linux 4.11) - -The list of deprecated attributes can be found here: -Documentation/ABI/obsolete/sysfs-block-zram - -Basically, every attribute that has its own read accessible sysfs node -(e.g. num_reads) *AND* is accessible via one of the stat files (zram/stat -or zram/io_stat or zram/mm_stat) is considered to be deprecated. User space is advised to use the following files to read the device statistics. @@ -211,22 +183,40 @@ The stat file represents device's I/O statistics not accounted by block layer and, thus, not available in zram/stat file. It consists of a single line of text and contains the following stats separated by whitespace: - failed_reads - failed_writes - invalid_io - notify_free + failed_reads the number of failed reads + failed_writes the number of failed writes + invalid_io the number of non-page-size-aligned I/O requests + notify_free Depending on device usage scenario it may account + a) the number of pages freed because of swap slot free + notifications or b) the number of pages freed because of + REQ_DISCARD requests sent by bio. The former ones are + sent to a swap block device when a swap slot is freed, + which implies that this disk is being used as a swap disk. + The latter ones are sent by filesystem mounted with + discard option, whenever some data blocks are getting + discarded. File /sys/block/zram/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: - orig_data_size - compr_data_size - mem_used_total - mem_limit - mem_used_max - zero_pages - num_migrated + orig_data_size uncompressed size of data stored in this disk. + This excludes zero-filled pages (zero_pages) since no + memory is allocated for them. + Unit: bytes + compr_data_size compressed size of data stored in this disk + mem_used_total the amount of memory allocated for this disk. This + includes allocator fragmentation and metadata overhead, + allocated for this disk. So, allocator space efficiency + can be calculated using compr_data_size and this statistic. + Unit: bytes + mem_limit the maximum amount of memory ZRAM can use to store + the compressed data + mem_used_max the maximum amount of memory zram have consumed to + store the data + zero_pages the number of zero filled pages written to this disk. + No memory is allocated for such pages. + pages_compacted the number of pages freed during compaction 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e5ab7d9e8c45..85737b6474ac 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -45,27 +45,6 @@ static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; -static inline void deprecated_attr_warn(const char *name) -{ - pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n", - task_pid_nr(current), - current->comm, - name, - "See zram documentation."); -} - -#define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ - struct device_attribute *attr, char *b) \ -{ \ - struct zram *zram = dev_to_zram(d); \ - \ - deprecated_attr_warn(__stringify(name)); \ - return scnprintf(b, PAGE_SIZE, "%llu\n", \ - (u64)atomic64_read(&zram->stats.name)); \ -} \ -static DEVICE_ATTR_RO(name); - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -218,47 +197,6 @@ static ssize_t disksize_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("orig_data_size"); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_total"); - down_read(&zram->init_lock); - if (init_done(zram)) { - struct zram_meta *meta = zram->meta; - val = zs_get_total_pages(meta->mem_pool); - } - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - -static ssize_t mem_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_limit"); - down_read(&zram->init_lock); - val = zram->limit_pages; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -277,21 +215,6 @@ static ssize_t mem_limit_store(struct device *dev, return len; } -static ssize_t mem_used_max_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - deprecated_attr_warn("mem_used_max"); - down_read(&zram->init_lock); - if (init_done(zram)) - val = atomic_long_read(&zram->stats.max_used_pages); - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); -} - static ssize_t mem_used_max_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -467,14 +390,6 @@ static ssize_t debug_stat_show(struct device *dev, static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); static inline bool zram_meta_get(struct zram *zram) { @@ -1188,10 +1103,8 @@ static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); static DEVICE_ATTR_RO(initstate); static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_WO(mem_limit); +static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); @@ -1199,17 +1112,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_failed_reads.attr, - &dev_attr_failed_writes.attr, &dev_attr_compact.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, -- cgit v1.2.3 From bbd00396d44e26899cb5339605fa6e93da2e14c9 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 8 Feb 2017 10:26:59 +1100 Subject: mm/ksm: documentation for coloured zero pages deduplication This patch adds the needed documentation for the use_zero_pages property. Link: http://lkml.kernel.org/r/1484927522-1964-1-git-send-email-imbrenda@linux.vnet.ibm.com Signed-off-by: Claudio Imbrenda Cc: Christian Borntraeger Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- Documentation/vm/ksm.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index f34a8ee6f860..0c64a81d6808 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -80,6 +80,20 @@ run - set 0 to stop ksmd from running but keep merged pages, Default: 0 (must be changed to 1 to activate KSM, except if CONFIG_SYSFS is disabled) +use_zero_pages - specifies whether empty pages (i.e. allocated pages + that only contain zeroes) should be treated specially. + When set to 1, empty pages are merged with the kernel + zero page(s) instead of with each other as it would + happen normally. This can improve the performance on + architectures with coloured zero pages, depending on + the workload. Care should be taken when enabling this + setting, as it can potentially degrade the performance + of KSM for some workloads, for example if the checksums + of pages candidate for merging match the checksum of + an empty page. This setting can be changed at any time, + it is only effective for pages merged after the change. + Default: 0 (normal KSM behaviour as in earlier releases) + The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: pages_shared - how many shared pages are being used -- cgit v1.2.3 From f09d0de5d2b7d47830aba3c493763cff016c355b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 8 Feb 2017 10:27:38 +1100 Subject: mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count If madvise(2) advice will result in the underlying vma being split and the number of areas mapped by the process will exceed /proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN. EAGAIN is returned by madvise(2) when a kernel resource, such as slab, is temporarily unavailable. It indicates that userspace should retry the advice in the near future. This is important for advice such as MADV_DONTNEED which is often used by malloc implementations to free memory back to the system: we really do want to free memory back when madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas, or mempolicies) cannot be allocated. Encountering /proc/sys/vm/max_map_count is not a temporary failure, however, so return ENOMEM to indicate this is a more serious issue. A followup patch to the man page will specify this behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Johannes Weiner Cc: Jerome Marchand Cc: "Kirill A. Shutemov" Cc: Michael Kerrisk Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/sysctl/vm.txt | 4 ++-- Documentation/vm/ksm.txt | 4 ++++ include/linux/mm.h | 6 ++++-- mm/madvise.c | 51 +++++++++++++++++++++++++++++++++++++-------- mm/mmap.c | 8 +++---- 5 files changed, 56 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 95ccbe6d79ce..b4ad97f10b8e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -376,8 +376,8 @@ max_map_count: This file contains the maximum number of memory map areas a process may have. Memory map areas are used as a side-effect of calling -malloc, directly by mmap and mprotect, and also when loading shared -libraries. +malloc, directly by mmap, mprotect, and madvise, and also when loading +shared libraries. While most applications need less than a thousand maps, certain programs, particularly malloc debuggers, may consume lots of them, diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 0c64a81d6808..6b0ca7feb135 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range cannot contain any pages which KSM could actually merge; even if MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. +If a region of memory must be split into at least one new MADV_MERGEABLE +or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process +will exceed vm.max_map_count (see Documentation/sysctl/vm.txt). + Like other madvise calls, they are intended for use on mapped areas of the user address space: they will report ENOMEM if the specified range includes unmapped gaps (though working on the intervening mapped areas), diff --git a/include/linux/mm.h b/include/linux/mm.h index 4a1e09316ed1..2be2ea04b1c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2037,8 +2037,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); diff --git a/mm/madvise.c b/mm/madvise.c index 0012071a6e50..11fc65f81ecd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; } @@ -120,15 +134,37 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, start, 1); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; + goto out; + } + error = __split_vma(mm, vma, end, 0); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } } success: @@ -136,10 +172,7 @@ success: * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; - out: - if (error == -ENOMEM) - error = -EAGAIN; return error; } diff --git a/mm/mmap.c b/mm/mmap.c index 1cec28d20583..499b988b1639 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2499,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. We use this on the - * munmap path where it doesn't make sense to fail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. */ -static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; int err; -- cgit v1.2.3 From 79fe35d9a263893eeddb94868a1d4ac541432e32 Mon Sep 17 00:00:00 2001 From: zhouxianrong Date: Wed, 8 Feb 2017 10:27:44 +1100 Subject: zram: extend zero pages to same element pages The idea is that without doing more calculations we extend zero pages to same element pages for zram. zero page is special case of same element page with zero element. 1. the test is done under android 7.0 2. startup too many applications circularly 3. sample the zero pages, same pages (none-zero element) and total pages in function page_zero_filled the result is listed as below: ZERO SAME TOTAL 36214 17842 598196 ZERO/TOTAL SAME/TOTAL (ZERO+SAME)/TOTAL ZERO/SAME AVERAGE 0.060631909 0.024990816 0.085622726 2.663825038 STDEV 0.00674612 0.005887625 0.009707034 2.115881328 MAX 0.069698422 0.030046087 0.094975336 7.56043956 MIN 0.03959586 0.007332205 0.056055193 1.928985507 from the above data, the benefit is about 2.5% and up to 3% of total swapout pages. The defect of the patch is that when we recovery a page from non-zero element the operations are low efficient for partial read. This patch extends zero_page to same_page so if there is any user to have monitored zero_pages, he will be surprised if the number is increased but it's not harmful, I believe. Link: http://lkml.kernel.org/r/1483692145-75357-1-git-send-email-zhouxianrong@huawei.com Link: http://lkml.kernel.org/r/1486307804-27903-1-git-send-email-minchan@kernel.org Signed-off-by: zhouxianrong Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- Documentation/blockdev/zram.txt | 6 ++-- drivers/block/zram/zram_drv.c | 80 ++++++++++++++++++++++++++++------------- drivers/block/zram/zram_drv.h | 9 +++-- 3 files changed, 64 insertions(+), 31 deletions(-) (limited to 'Documentation') diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 1c0c08d9206b..4fced8a21307 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -201,8 +201,8 @@ File /sys/block/zram/mm_stat The stat file represents device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: orig_data_size uncompressed size of data stored in this disk. - This excludes zero-filled pages (zero_pages) since no - memory is allocated for them. + This excludes same-element-filled pages (same_pages) since + no memory is allocated for them. Unit: bytes compr_data_size compressed size of data stored in this disk mem_used_total the amount of memory allocated for this disk. This @@ -214,7 +214,7 @@ line of text and contains the following stats separated by whitespace: the compressed data mem_used_max the maximum amount of memory zram have consumed to store the data - zero_pages the number of zero filled pages written to this disk. + same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. pages_compacted the number of pages freed during compaction diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eaeff0696420..c20b05a84f21 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -74,6 +74,17 @@ static void zram_clear_flag(struct zram_meta *meta, u32 index, meta->table[index].value &= ~BIT(flag); } +static inline void zram_set_element(struct zram_meta *meta, u32 index, + unsigned long element) +{ + meta->table[index].element = element; +} + +static inline void zram_clear_element(struct zram_meta *meta, u32 index) +{ + meta->table[index].element = 0; +} + static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) { return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); @@ -146,31 +157,46 @@ static inline void update_used_max(struct zram *zram, } while (old_max != cur_max); } -static bool page_zero_filled(void *ptr) +static inline void zram_fill_page(char *ptr, unsigned long len, + unsigned long value) +{ + int i; + unsigned long *page = (unsigned long *)ptr; + + WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long))); + + if (likely(value == 0)) { + memset(ptr, 0, len); + } else { + for (i = 0; i < len / sizeof(*page); i++) + page[i] = value; + } +} + +static bool page_same_filled(void *ptr, unsigned long *element) { unsigned int pos; unsigned long *page; page = (unsigned long *)ptr; - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) + for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) { + if (page[pos] != page[pos + 1]) return false; } + *element = page[pos]; + return true; } -static void handle_zero_page(struct bio_vec *bvec) +static void handle_same_page(struct bio_vec *bvec, unsigned long element) { struct page *page = bvec->bv_page; void *user_mem; user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); + zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element); kunmap_atomic(user_mem); flush_dcache_page(page); @@ -363,7 +389,7 @@ static ssize_t mm_stat_show(struct device *dev, mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.same_pages), pool_stats.pages_compacted); up_read(&zram->init_lock); @@ -450,18 +476,20 @@ static void zram_free_page(struct zram *zram, size_t index) struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - if (unlikely(!handle)) { - /* - * No memory is allocated for zero filled pages. - * Simply clear zero page flag. - */ - if (zram_test_flag(meta, index, ZRAM_ZERO)) { - zram_clear_flag(meta, index, ZRAM_ZERO); - atomic64_dec(&zram->stats.zero_pages); - } + /* + * No memory is allocated for same element filled pages. + * Simply clear same page flag. + */ + if (zram_test_flag(meta, index, ZRAM_SAME)) { + zram_clear_flag(meta, index, ZRAM_SAME); + zram_clear_element(meta, index); + atomic64_dec(&zram->stats.same_pages); return; } + if (!handle) + return; + zs_free(meta->mem_pool, handle); atomic64_sub(zram_get_obj_size(meta, index), @@ -484,9 +512,9 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); - if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) { bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - clear_page(mem); + zram_fill_page(mem, PAGE_SIZE, meta->table[index].element); return 0; } @@ -522,9 +550,9 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_ZERO)) { + zram_test_flag(meta, index, ZRAM_SAME)) { bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - handle_zero_page(bvec); + handle_same_page(bvec, meta->table[index].element); return 0; } bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); @@ -572,6 +600,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct zram_meta *meta = zram->meta; struct zcomp_strm *zstrm = NULL; unsigned long alloced_pages; + unsigned long element; page = bvec->bv_page; if (is_partial_io(bvec)) { @@ -600,16 +629,17 @@ compress_again: uncmem = user_mem; } - if (page_zero_filled(uncmem)) { + if (page_same_filled(uncmem, &element)) { if (user_mem) kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - zram_set_flag(meta, index, ZRAM_ZERO); + zram_set_flag(meta, index, ZRAM_SAME); + zram_set_element(meta, index, element); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); - atomic64_inc(&zram->stats.zero_pages); + atomic64_inc(&zram->stats.same_pages); ret = 0; goto out; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2692554b7737..caeff51f1571 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -61,7 +61,7 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_SAME = ZRAM_FLAG_SHIFT, ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, @@ -71,7 +71,10 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { - unsigned long handle; + union { + unsigned long handle; + unsigned long element; + }; unsigned long value; }; @@ -83,7 +86,7 @@ struct zram_stats { atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t same_pages; /* no. of same element filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ -- cgit v1.2.3 From 417f812342a519464793bd32ef3c6e0a62ded51a Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:05 +1100 Subject: autofs: fix typo in Documentation Link: http://lkml.kernel.org/r/148577164606.9801.12571810310561599401.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- Documentation/filesystems/autofs4.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index 8fac3fe7b8c9..a5dc56ff0a0e 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -65,7 +65,7 @@ directory is a mount trap only if the filesystem is mounted *direct* and the root is empty. Directories created in the root directory are mount traps only if the -filesystem is mounted *indirect* and they are empty. +filesystem is mounted *indirect* and they are empty. Directories further down the tree depend on the *maxproto* mount option and particularly whether it is less than five or not. @@ -352,7 +352,7 @@ Communicating with autofs: root directory ioctls ------------------------------------------------ The root directory of an autofs filesystem will respond to a number of -ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN +ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN capability, or must be the automount daemon. The available ioctl commands are: @@ -512,7 +512,7 @@ always be mounted "shared". e.g. > `mount --make-shared /autofs/mount/point` -The automount daemon is only able to mange a single mount location for +The automount daemon is only able to manage a single mount location for an autofs filesystem and if mounts on that are not 'shared', other locations will not behave as expected. In particular access to those other locations will likely result in the `ELOOP` error -- cgit v1.2.3 From 2ad98775db07b01605ca85387fa455a01e06c5ee Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:06 +1100 Subject: autofs: fix wrong ioctl documentation regarding devid This is the same as d8732841 except that it's a different file. A caller has no devid input, and devid is obtained via superblock. Link: http://lkml.kernel.org/r/148577165119.9801.16967562019122274820.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- Documentation/filesystems/autofs4.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index a5dc56ff0a0e..75348631a3d8 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -457,9 +457,8 @@ Commands are: daemon. - **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a name within the filesystem that has been auto-mounted on. - arg1 is the dev number of the underlying autofs. On successful - return, `arg1` and `arg2` will be the UID and GID of the process - which triggered that mount. + On successful return, `arg1` and `arg2` will be the UID and GID + of the process which triggered that mount. - **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a mountpoint of a particular type - see separate documentation for -- cgit v1.2.3 From 997a28e4d5a7bbc65b2404356d0696577f2a3b9e Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 8 Feb 2017 10:28:07 +1100 Subject: autofs: update ioctl documentation regarding struct autofs_dev_ioctl This is the same as bf72eda5 except that it's a different file. Sync documentation with changes made by 730c9eec in 2009. Link: http://lkml.kernel.org/r/148577165630.9801.6081791213151121657.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton --- .../filesystems/autofs4-mount-control.txt | 1 + Documentation/filesystems/autofs4.txt | 32 ++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 50a3e01a36f8..e5177cb31a04 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt @@ -179,6 +179,7 @@ struct autofs_dev_ioctl { * including this struct */ __s32 ioctlfd; /* automount command fd */ + /* Command parameters */ union { struct args_protover protover; struct args_protosubver protosubver; diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index 75348631a3d8..f10dd590f69f 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -425,8 +425,20 @@ Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: * including this struct */ __s32 ioctlfd; /* automount command fd */ - __u32 arg1; /* Command parameters */ - __u32 arg2; + /* Command parameters */ + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; char path[0]; }; @@ -446,24 +458,22 @@ Commands are: set version numbers. - **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor on the root of an autofs filesystem. The filesystem is identified - by name and device number, which is stored in `arg1`. Device - numbers for existing filesystems can be found in + by name and device number, which is stored in `openmount.devid`. + Device numbers for existing filesystems can be found in `/proc/self/mountinfo`. - **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`. - **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in catatonic mode, this can provide the write end of a new pipe - in `arg1` to re-establish communication with a daemon. The - process group of the calling process is used to identify the + in `setpipefd.pipefd` to re-establish communication with a daemon. + The process group of the calling process is used to identify the daemon. - **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a name within the filesystem that has been auto-mounted on. - On successful return, `arg1` and `arg2` will be the UID and GID - of the process which triggered that mount. - + On successful return, `requester.uid` and `requester.gid` will be + the UID and GID of the process which triggered that mount. - **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a mountpoint of a particular type - see separate documentation for details. - - **AUTOFS_DEV_IOCTL_PROTOVER_CMD**: - **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**: - **AUTOFS_DEV_IOCTL_READY_CMD**: @@ -473,7 +483,7 @@ Commands are: - **AUTOFS_DEV_IOCTL_EXPIRE_CMD**: - **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same function as the similarly named **AUTOFS_IOC** ioctls, except - that **FAIL** can be given an explicit error number in `arg1` + that **FAIL** can be given an explicit error number in `fail.status` instead of assuming `ENOENT`, and this **EXPIRE** command corresponds to **AUTOFS_IOC_EXPIRE_MULTI**. -- cgit v1.2.3