Diffstat (limited to 'mm'):
 mm/Kconfig          |   25
 mm/Makefile         |    1
 mm/bounce.c         |   44
 mm/filemap.c        |   42
 mm/huge_memory.c    |   14
 mm/internal.h       |    1
 mm/memblock.c       |    7
 mm/memcontrol.c     |    7
 mm/mempolicy.c      |   26
 mm/migrate.c        |    2
 mm/mm_init.c        |    2
 mm/oom_kill.c       |    2
 mm/page-writeback.c |   57
 mm/page_io.c        |   10
 mm/readahead.c      |   15
 mm/shmem.c          |   57
 mm/slab_common.c    |   19
 mm/slub.c           |   31
 mm/vmalloc.c        |   20
 mm/vmscan.c         |   23
 mm/zsmalloc.c       | 1106
 21 files changed, 1304 insertions(+), 207 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 723bbe04a0b0..2d9f1504d75e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -552,3 +552,28 @@ config MEM_SOFT_DIRTY
it can be cleared by hands.
See Documentation/vm/soft-dirty.txt for more details.
+
+config ZSMALLOC
+ bool "Memory allocator for compressed pages"
+ depends on MMU
+ default n
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ compressed RAM pages. zsmalloc uses virtual memory mapping
+ in order to reduce fragmentation. However, this results in a
+ non-standard allocator interface where a handle, not a pointer, is
+ returned by an alloc(). This handle must be mapped in order to
+ access the allocated space.
+
+config PGTABLE_MAPPING
+ bool "Use page table mapping to access object in zsmalloc"
+ depends on ZSMALLOC
+ help
+ By default, zsmalloc uses a copy-based object mapping method to
+ access allocations that span two pages. However, if a particular
+ architecture (e.g., ARM) performs VM mapping faster than copying,
+ then you should select this. This causes zsmalloc to use page table
+ mapping rather than copying for object mapping.
+
+ You can check speed with the zsmalloc benchmark[1].
+ [1] https://github.com/spartacus06/zsmalloc
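
The help text above describes the handle-based API this patch adds in mm/zsmalloc.c further down. A minimal sketch of how a client (such as zram) might drive it; illustrative only, and it assumes the zs_mapmode enum (ZS_MM_WO etc.) from include/linux/zsmalloc.h, which this mm/-only diff does not show:

	#include <linux/gfp.h>
	#include <linux/string.h>
	#include <linux/zsmalloc.h>

	static int zsmalloc_demo(void)
	{
		struct zs_pool *pool;
		unsigned long handle;
		void *vaddr;

		/* flags are used later, when the pool grows by allocating zspages */
		pool = zs_create_pool(GFP_KERNEL);
		if (!pool)
			return -ENOMEM;

		/* zs_malloc() returns an opaque handle, not a pointer; 0 on failure */
		handle = zs_malloc(pool, 128);
		if (!handle)
			goto out_destroy;

		/*
		 * The handle must be mapped before the memory can be touched.
		 * zs_map_object() returns with preemption and page faults disabled.
		 */
		vaddr = zs_map_object(pool, handle, ZS_MM_WO);
		memset(vaddr, 0, 128);
		zs_unmap_object(pool, handle);

		zs_free(pool, handle);
	out_destroy:
		zs_destroy_pool(pool);
		return handle ? 0 : -ENOMEM;
	}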
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..310c90a09264 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZBUD) += zbud.o
+obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
diff --git a/mm/bounce.c b/mm/bounce.c
index 5a7d58fb883b..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -98,27 +98,24 @@ int init_emergency_isa_pool(void)
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
- struct bio_vec *tovec, *fromvec;
- int i;
-
- bio_for_each_segment(tovec, to, i) {
- fromvec = from->bi_io_vec + i;
-
- /*
- * not bounced
- */
- if (tovec->bv_page == fromvec->bv_page)
- continue;
-
- /*
- * fromvec->bv_offset and fromvec->bv_len might have been
- * modified by the block layer, so use the original copy,
- * bounce_copy_vec already uses tovec->bv_len
- */
- vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+ struct bio_vec tovec, *fromvec = from->bi_io_vec;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(tovec, to, iter) {
+ if (tovec.bv_page != fromvec->bv_page) {
+ /*
+ * fromvec->bv_offset and fromvec->bv_len might have
+ * been modified by the block layer, so use the original
+ * copy, bounce_copy_vec already uses tovec->bv_len
+ */
+ vfrom = page_address(fromvec->bv_page) +
+ tovec.bv_offset;
+
+ bounce_copy_vec(&tovec, vfrom);
+ flush_dcache_page(tovec.bv_page);
+ }
- bounce_copy_vec(tovec, vfrom);
- flush_dcache_page(tovec->bv_page);
+ fromvec++;
}
}
@@ -201,13 +198,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
- struct bio_vec *to, *from;
+ struct bio_vec *to, from;
+ struct bvec_iter iter;
unsigned i;
if (force)
goto bounce;
- bio_for_each_segment(from, *bio_orig, i)
- if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+ bio_for_each_segment(from, *bio_orig, iter)
+ if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
goto bounce;
return;
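
For context on the bounce.c hunks above: this is part of the block layer's immutable-biovec conversion. bio_for_each_segment() no longer hands out pointers into bi_io_vec indexed by an int; it copies each segment into a struct bio_vec by value, driven by a struct bvec_iter cursor, so iteration also works on split or cloned bios. A minimal sketch of the new style (hypothetical helper, not from this patch):

	#include <linux/bio.h>

	/* Sum the payload of a bio using the post-conversion iterator. */
	static unsigned int bio_payload_bytes(struct bio *bio)
	{
		struct bio_vec bv;	/* filled in by value at each step */
		struct bvec_iter iter;	/* cursor over the (possibly shared) bvec array */
		unsigned int bytes = 0;

		bio_for_each_segment(bv, bio, iter)
			bytes += bv.bv_len;

		return bytes;
	}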
diff --git a/mm/filemap.c b/mm/filemap.c
index 7a7f3e0db738..d56d3c145b9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1428,30 +1428,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
- if (pos < size) {
- retval = filemap_write_and_wait_range(mapping, pos,
+ retval = filemap_write_and_wait_range(mapping, pos,
pos + iov_length(iov, nr_segs) - 1);
- if (!retval) {
- retval = mapping->a_ops->direct_IO(READ, iocb,
- iov, pos, nr_segs);
- }
- if (retval > 0) {
- *ppos = pos + retval;
- count -= retval;
- }
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+ }
+ if (retval > 0) {
+ *ppos = pos + retval;
+ count -= retval;
+ }
- /*
- * Btrfs can have a short DIO read if we encounter
- * compressed extents, so if there was an error, or if
- * we've already read everything we wanted to, or if
- * there was a short read because we hit EOF, go ahead
- * and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
- */
- if (retval < 0 || !count || *ppos >= size) {
- file_accessed(filp);
- goto out;
- }
+ /*
+ * Btrfs can have a short DIO read if we encounter
+ * compressed extents, so if there was an error, or if
+ * we've already read everything we wanted to, or if
+ * there was a short read because we hit EOF, go ahead
+ * and return. Otherwise fallthrough to buffered io for
+ * the rest of the read.
+ */
+ if (retval < 0 || !count || *ppos >= size) {
+ file_accessed(filp);
+ goto out;
}
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65c98eb5483c..82166bf974e1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1508,19 +1508,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- if (new_ptl != old_ptl) {
- pgtable_t pgtable;
- /*
- * Move preallocated PTE page table if new_pmd is on
- * different PMD page table.
- */
+ if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
- spin_unlock(new_ptl);
}
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
out:
diff --git a/mm/internal.h b/mm/internal.h
index 612c14f5e0f5..29e1e761f9eb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,7 +83,6 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern bool zone_reclaimable(struct zone *zone);
/*
diff --git a/mm/memblock.c b/mm/memblock.c
index 9c0aeef19440..39a31e7f0045 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -984,9 +984,6 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
if (!align)
align = SMP_CACHE_BYTES;
- /* align @size to avoid excessive fragmentation on reserved array */
- size = round_up(size, align);
-
found = memblock_find_in_range_node(size, align, 0, max_addr, nid);
if (found && !memblock_reserve(found, size))
return found;
@@ -1080,8 +1077,8 @@ static void * __init memblock_virt_alloc_internal(
if (!align)
align = SMP_CACHE_BYTES;
- /* align @size to avoid excessive fragmentation on reserved array */
- size = round_up(size, align);
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19d5d4274e22..53385cd4e6f0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3400,7 +3400,7 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *s)
{
- struct kmem_cache *new;
+ struct kmem_cache *new = NULL;
static char *tmp_name = NULL;
static DEFINE_MUTEX(mutex); /* protects tmp_name */
@@ -3416,7 +3416,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
if (!tmp_name) {
tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!tmp_name)
- return NULL;
+ goto out;
}
rcu_read_lock();
@@ -3426,12 +3426,11 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
(s->flags & ~SLAB_PANIC), s->ctor, s);
-
if (new)
new->allocflags |= __GFP_KMEMCG;
else
new = s;
-
+out:
mutex_unlock(&mutex);
return new;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 463b7fbf0d1d..ae3c8f3595d4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
return 0;
}
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
- BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
{
return 0;
}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Walk through page tables and collect pages to be migrated.
@@ -2655,7 +2654,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
}
#ifdef CONFIG_NUMA_BALANCING
-static bool __initdata numabalancing_override;
+static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
@@ -2664,9 +2663,15 @@ static void __init check_numabalancing_enable(void)
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
+
if (nr_node_ids > 1 && !numabalancing_override) {
- printk(KERN_INFO "Enabling automatic NUMA balancing. "
- "Configure with numa_balancing= or the kernel.numa_balancing sysctl");
+ pr_info("%s automatic NUMA balancing. "
+ "Configure with numa_balancing= or the "
+ "kernel.numa_balancing sysctl",
+ numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
@@ -2676,18 +2681,17 @@ static int __init setup_numabalancing(char *str)
int ret = 0;
if (!str)
goto out;
- numabalancing_override = true;
if (!strcmp(str, "enable")) {
- set_numabalancing_state(true);
+ numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_numabalancing_state(false);
+ numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
- printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+ pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
@@ -2926,7 +2930,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
- if (pol && pol != &default_policy) {
+ if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
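
The setup_numabalancing()/check_numabalancing_enable() rework above turns the boot parameter into a tri-state recorded at parse time and applied later, once the built-in default is known, instead of poking set_numabalancing_state() during early parsing. The pattern, reduced to a stand-alone sketch with illustrative names:

	#include <stdio.h>
	#include <string.h>

	static int override;	/* 0 = not given, 1 = "enable", -1 = "disable" */

	static void parse_param(const char *str)
	{
		if (!strcmp(str, "enable"))
			override = 1;
		else if (!strcmp(str, "disable"))
			override = -1;
	}

	static void apply_state(int default_on)
	{
		/* an explicit override always wins over the built-in default */
		int on = override ? (override == 1) : default_on;

		printf("%s automatic NUMA balancing\n", on ? "Enabling" : "Disabling");
	}

	int main(void)
	{
		parse_param("disable");
		apply_state(1);		/* prints "Disabling ..." despite the default */
		return 0;
	}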
diff --git a/mm/migrate.c b/mm/migrate.c
index 734704f6f29b..482a33d89134 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1548,8 +1548,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOMEMALLOC | __GFP_NORETRY |
__GFP_NOWARN) &
~GFP_IOFS, 0);
- if (newpage)
- page_cpupid_xchg_last(newpage, page_cpupid_last(page));
return newpage;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 857a6434e3a5..4074caf9936b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,4 +202,4 @@ static int __init mm_sysfs_init(void)
return 0;
}
-pure_initcall(mm_sysfs_init);
+postcore_initcall(mm_sysfs_init);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 37b1b1903fb2..3291e82d4352 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -178,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- adj -= 30;
+ points -= (points * 3) / 100;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
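
The oom_kill.c hunk replaces the fixed CAP_SYS_ADMIN bonus (adj -= 30, i.e. 3% of *total* memory once adj is scaled by totalpages / 1000) with a discount of 3% of the task's *own* badness points. Illustrative arithmetic with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		long totalpages = 1000000;	/* machine size in pages */
		long points = totalpages / 10;	/* root task using 10% of RAM */

		long old_bonus = 30 * (totalpages / 1000);	/* fixed: 30000 pages */
		long new_bonus = (points * 3) / 100;		/* proportional: 3000 */

		printf("old: %ld, new: %ld (of %ld points)\n",
		       old_bonus, new_bonus, points);
		return 0;
	}

With the old scheme, a small root-owned task could have nearly its whole score wiped out; the new scheme discounts every root task by the same 3% of its own usage.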
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63807583d8e8..2d30e2cfe804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -191,6 +191,26 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long nr_pages;
+
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -198,11 +218,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -238,9 +256,12 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x = global_page_state(NR_FREE_PAGES);
x -= min(x, dirty_balance_reserve);
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
+
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -289,32 +310,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- /*
- * The effective global number of dirtyable pages may exclude
- * highmem as a big-picture measure to keep the ratio between
- * dirty memory and lowmem reasonable.
- *
- * But this function is purely about the individual zone and a
- * highmem zone can hold its share of dirty pages, so we don't
- * care about vm_highmem_is_dirtyable here.
- */
- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone);
-
- /* don't allow this to underflow */
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
- return nr_pages;
-}
-
-/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
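
The page-writeback.c change redefines zone_dirtyable_memory() as free pages (minus the dirty balance reserve) plus file LRU pages, rather than free pages plus everything zone_reclaimable_pages() counted, which included anonymous memory that cannot hold dirty page cache. A worked example with made-up per-zone counts, showing roughly how the result feeds the per-zone dirty limit:

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_free = 100000, reserve = 10000;
		unsigned long inactive_file = 50000, active_file = 30000;
		unsigned long vm_dirty_ratio = 20;	/* default sysctl value */

		/* mirrors: nr_pages -= min(nr_pages, zone->dirty_balance_reserve) */
		unsigned long dirtyable =
			nr_free - (reserve < nr_free ? reserve : nr_free);
		dirtyable += inactive_file + active_file;	/* file pages only */

		printf("dirtyable=%lu pages, zone dirty limit=%lu pages\n",
		       dirtyable, dirtyable * vm_dirty_ratio / 100);
		return 0;
	}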
diff --git a/mm/page_io.c b/mm/page_io.c
index 7247be6114ac..7c59ef681381 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
- bio->bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_size = PAGE_SIZE;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err)
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err)
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44aa90b..0de2360d65f3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -211,8 +211,6 @@ out:
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
- int ret = 0;
-
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
@@ -226,15 +224,13 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk, 0);
- if (err < 0) {
- ret = err;
- break;
- }
- ret += err;
+ if (err < 0)
+ return err;
+
offset += this_chunk;
nr_to_read -= this_chunk;
}
- return ret;
+ return 0;
}
/*
@@ -576,8 +572,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops)
return -EINVAL;
- force_page_cache_readahead(mapping, filp, index, nr);
- return 0;
+ return force_page_cache_readahead(mapping, filp, index, nr);
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
diff --git a/mm/shmem.c b/mm/shmem.c
index 8156f95ec0cf..1f18c9d0d93e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt;
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
-#include <linux/generic_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -620,10 +620,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
setattr_copy(inode, attr);
-#ifdef CONFIG_TMPFS_POSIX_ACL
if (attr->ia_valid & ATTR_MODE)
- error = generic_acl_chmod(inode);
-#endif
+ error = posix_acl_chmod(inode, inode->i_mode);
return error;
}
@@ -1937,22 +1935,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#endif
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
error = security_inode_init_security(inode, dir,
&dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
@@ -1961,6 +1951,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
dget(dentry); /* Extra count - pin the dentry in core */
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int
@@ -1974,24 +1967,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
error = security_inode_init_security(inode, dir,
NULL,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#else
- error = 0;
-#endif
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
d_tmpfile(dentry, inode);
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2223,8 +2209,8 @@ static int shmem_initxattrs(struct inode *inode,
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
- &generic_acl_access_handler,
- &generic_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};
@@ -2740,6 +2726,7 @@ static const struct inode_operations shmem_inode_operations = {
.getxattr = shmem_getxattr,
.listxattr = shmem_listxattr,
.removexattr = shmem_removexattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2764,6 +2751,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2776,6 +2764,7 @@ static const struct inode_operations shmem_special_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8e40321da091..1ec3c619ba04 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -233,14 +233,17 @@ out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
- /*
- * There is no point in flooding logs with warnings or especially
- * crashing the system if we fail to create a cache for a memcg. In
- * this case we will be accounting the memcg allocation to the root
- * cgroup until we succeed to create its own cache, but it isn't that
- * critical.
- */
- if (err && !memcg) {
+ if (err) {
+ /*
+ * There is no point in flooding logs with warnings or
+ * especially crashing the system if we fail to create a cache
+ * for a memcg. In this case we will be accounting the memcg
+ * allocation to the root cgroup until we succeed to create its
+ * own cache, but it isn't that critical.
+ */
+ if (!memcg)
+ return NULL;
+
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
diff --git a/mm/slub.c b/mm/slub.c
index 34bb8c65a2d8..2b1a6970e46f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -355,6 +355,21 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
+{
+ struct page tmp;
+ tmp.counters = counters_new;
+ /*
+ * page->counters can cover frozen/inuse/objects as well
+ * as page->_count. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_count, so
+ * be careful and only assign to the fields we need.
+ */
+ page->frozen = tmp.frozen;
+ page->inuse = tmp.inuse;
+ page->objects = tmp.objects;
+}
+
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -376,7 +391,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
return 1;
}
@@ -415,7 +430,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
return 1;
@@ -1559,7 +1574,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
new.freelist = freelist;
}
- VM_BUG_ON_PAGE(new.frozen, &new);
+ VM_BUG_ON(new.frozen);
new.frozen = 1;
if (!__cmpxchg_double_slab(s, page,
@@ -1812,7 +1827,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
set_freepointer(s, freelist, prior);
new.counters = counters;
new.inuse--;
- VM_BUG_ON_PAGE(!new.frozen, &new);
+ VM_BUG_ON(!new.frozen);
} while (!__cmpxchg_double_slab(s, page,
prior, counters,
@@ -1840,7 +1855,7 @@ redo:
old.freelist = page->freelist;
old.counters = page->counters;
- VM_BUG_ON_PAGE(!old.frozen, &old);
+ VM_BUG_ON(!old.frozen);
/* Determine target state of the slab */
new.counters = old.counters;
@@ -1952,7 +1967,7 @@ static void unfreeze_partials(struct kmem_cache *s,
old.freelist = page->freelist;
old.counters = page->counters;
- VM_BUG_ON_PAGE(!old.frozen, &old);
+ VM_BUG_ON(!old.frozen);
new.counters = old.counters;
new.freelist = old.freelist;
@@ -2225,7 +2240,7 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
counters = page->counters;
new.counters = counters;
- VM_BUG_ON_PAGE(!new.frozen, &new);
+ VM_BUG_ON(!new.frozen);
new.inuse = page->objects;
new.frozen = freelist != NULL;
@@ -2319,7 +2334,7 @@ load_freelist:
* page is pointing to the page from which the objects are obtained.
* That page must be frozen for per cpu allocations to work.
*/
- VM_BUG_ON_PAGE(!c->page->frozen, c->page);
+ VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e4f0db2a3eae..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -220,12 +220,12 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the physical pfn it maps to.
+ * Walk a vmap address to the struct page it maps.
*/
-unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+struct page *vmalloc_to_page(const void *vmalloc_addr)
{
unsigned long addr = (unsigned long) vmalloc_addr;
- unsigned long pfn = 0;
+ struct page *page = NULL;
pgd_t *pgd = pgd_offset_k(addr);
/*
@@ -244,23 +244,23 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
ptep = pte_offset_map(pmd, addr);
pte = *ptep;
if (pte_present(pte))
- pfn = pte_pfn(pte);
+ page = pte_page(pte);
pte_unmap(ptep);
}
}
}
- return pfn;
+ return page;
}
-EXPORT_SYMBOL(vmalloc_to_pfn);
+EXPORT_SYMBOL(vmalloc_to_page);
/*
- * Map a vmalloc()-space virtual address to the struct page.
+ * Map a vmalloc()-space virtual address to the physical page frame number.
*/
-struct page *vmalloc_to_page(const void *vmalloc_addr)
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
- return pfn_to_page(vmalloc_to_pfn(vmalloc_addr));
+ return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
-EXPORT_SYMBOL(vmalloc_to_page);
+EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 90c4075d8d75..a9c74b409681 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_control *sc)
}
#endif
-unsigned long zone_reclaimable_pages(struct zone *zone)
+static unsigned long zone_reclaimable_pages(struct zone *zone)
{
int nr;
@@ -3315,27 +3315,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
}
-/*
- * The reclaimable count would be mostly accurate.
- * The less reclaimable pages may be
- * - mlocked pages, which will be moved to unevictable list when encountered
- * - mapped pages, which may require several travels to be reclaimed
- * - dirty pages, which is not "instantly" reclaimable
- */
-unsigned long global_reclaimable_pages(void)
-{
- int nr;
-
- nr = global_page_state(NR_ACTIVE_FILE) +
- global_page_state(NR_INACTIVE_FILE);
-
- if (get_nr_swap_pages() > 0)
- nr += global_page_state(NR_ACTIVE_ANON) +
- global_page_state(NR_INACTIVE_ANON);
-
- return nr;
-}
-
#ifdef CONFIG_HIBERNATION
/*
* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
new file mode 100644
index 000000000000..c03ca5e9fe15
--- /dev/null
+++ b/mm/zsmalloc.c
@@ -0,0 +1,1106 @@
+/*
+ * zsmalloc memory allocator
+ *
+ * Copyright (C) 2011 Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the license that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+/*
+ * This allocator is designed for use with zram. Thus, the allocator is
+ * supposed to work well under low memory conditions. In particular, it
+ * never attempts higher order page allocation which is very likely to
+ * fail under memory pressure. On the other hand, if we just use single
+ * (0-order) pages, it would suffer from very high fragmentation --
+ * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
+ * This was one of the major issues with its predecessor (xvmalloc).
+ *
+ * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
+ * and links them together using various 'struct page' fields. These linked
+ * pages act as a single higher-order page i.e. an object can span 0-order
+ * page boundaries. The code refers to these linked pages as a single entity
+ * called zspage.
+ *
+ * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
+ * since this satisfies the requirements of all its current users (in the
+ * worst case, page is incompressible and is thus stored "as-is" i.e. in
+ * uncompressed form). For allocation requests larger than this size, failure
+ * is returned (see zs_malloc).
+ *
+ * Additionally, zs_malloc() does not return a dereferenceable pointer.
+ * Instead, it returns an opaque handle (unsigned long) which encodes actual
+ * location of the allocated object. The reason for this indirection is that
+ * zsmalloc does not keep zspages permanently mapped since that would cause
+ * issues on 32-bit systems where the VA region for kernel space mappings
+ * is very small. So, before using the allocated memory, the object has to
+ * be mapped using zs_map_object() to get a usable pointer and subsequently
+ * unmapped using zs_unmap_object().
+ *
+ * Following is how we use various fields and flags of underlying
+ * struct page(s) to form a zspage.
+ *
+ * Usage of struct page fields:
+ * page->first_page: points to the first component (0-order) page
+ * page->index (union with page->freelist): offset of the first object
+ * starting in this page. For the first page, this is
+ * always 0, so we use this field (aka freelist) to point
+ * to the first free object in zspage.
+ * page->lru: links together all component pages (except the first page)
+ * of a zspage
+ *
+ * For _first_ page only:
+ *
+ * page->private (union with page->first_page): refers to the
+ * component page after the first page
+ * page->freelist: points to the first free object in zspage.
+ * Free objects are linked together using in-place
+ * metadata.
+ * page->objects: maximum number of objects we can store in this
+ * zspage (class->zspage_order * PAGE_SIZE / class->size)
+ * page->lru: links together first pages of various zspages.
+ * Basically forming list of zspages in a fullness group.
+ * page->mapping: class index and fullness group of the zspage
+ *
+ * Usage of struct page flags:
+ * PG_private: identifies the first component page
+ * PG_private2: identifies the last component page
+ *
+ */
+
+#ifdef CONFIG_ZSMALLOC_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <linux/cpumask.h>
+#include <linux/cpu.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/zsmalloc.h>
+
+/*
+ * This must be a power of 2 and greater than or equal to sizeof(link_free).
+ * These two conditions ensure that any 'struct link_free' itself doesn't
+ * span more than 1 page which avoids complex case of mapping 2 pages simply
+ * to restore link_free pointer values.
+ */
+#define ZS_ALIGN 8
+
+/*
+ * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
+ * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
+ */
+#define ZS_MAX_ZSPAGE_ORDER 2
+#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+
+/*
+ * Object location (<PFN>, <obj_idx>) is encoded
+ * as a single (unsigned long) handle value.
+ *
+ * Note that object index <obj_idx> is relative to system
+ * page <PFN> it is stored in, so for each sub-page belonging
+ * to a zspage, obj_idx starts with 0.
+ *
+ * This is made more complicated by various memory models and PAE.
+ */
+
+#ifndef MAX_PHYSMEM_BITS
+#ifdef CONFIG_HIGHMEM64G
+#define MAX_PHYSMEM_BITS 36
+#else /* !CONFIG_HIGHMEM64G */
+/*
+ * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
+ * be PAGE_SHIFT
+ */
+#define MAX_PHYSMEM_BITS BITS_PER_LONG
+#endif
+#endif
+#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
+#define ZS_MIN_ALLOC_SIZE \
+ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
+
+/*
+ * On systems with 4K page size, this gives 255 size classes! There is a
+ * trade-off here:
+ * - Large number of size classes is potentially wasteful as free pages are
+ * spread across these classes
+ * - Small number of size classes causes large internal fragmentation
+ * - Probably it's better to use specific size classes (empirically
+ * determined). NOTE: all those class sizes must be set as multiple of
+ * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
+ *
+ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
+ * (reason above)
+ */
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
+ ZS_SIZE_CLASS_DELTA + 1)
+
+/*
+ * We do not maintain any list for completely empty or full pages
+ */
+enum fullness_group {
+ ZS_ALMOST_FULL,
+ ZS_ALMOST_EMPTY,
+ _ZS_NR_FULLNESS_GROUPS,
+
+ ZS_EMPTY,
+ ZS_FULL
+};
+
+/*
+ * We assign a page to ZS_ALMOST_EMPTY fullness group when:
+ * n <= N / f, where
+ * n = number of allocated objects
+ * N = total number of objects zspage can store
+ * f = 1/fullness_threshold_frac
+ *
+ * Similarly, we assign zspage to:
+ * ZS_ALMOST_FULL when n > N / f
+ * ZS_EMPTY when n == 0
+ * ZS_FULL when n == N
+ *
+ * (see: fix_fullness_group())
+ */
+static const int fullness_threshold_frac = 4;
+
+struct size_class {
+ /*
+ * Size of objects stored in this class. Must be multiple
+ * of ZS_ALIGN.
+ */
+ int size;
+ unsigned int index;
+
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
+
+ spinlock_t lock;
+
+ /* stats */
+ u64 pages_allocated;
+
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+};
+
+/*
+ * Placed within free objects to form a singly linked list.
+ * For every zspage, first_page->freelist gives head of this list.
+ *
+ * This must be a power of 2 and less than or equal to ZS_ALIGN
+ */
+struct link_free {
+ /* Handle of next free chunk (encodes <PFN, obj_idx>) */
+ void *next;
+};
+
+struct zs_pool {
+ struct size_class size_class[ZS_SIZE_CLASSES];
+
+ gfp_t flags; /* allocation flags used when growing pool */
+};
+
+/*
+ * A zspage's class index and fullness group
+ * are encoded in its (first)page->mapping
+ */
+#define CLASS_IDX_BITS 28
+#define FULLNESS_BITS 4
+#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
+#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
+
+struct mapping_area {
+#ifdef CONFIG_PGTABLE_MAPPING
+ struct vm_struct *vm; /* vm area for mapping object that span pages */
+#else
+ char *vm_buf; /* copy buffer for objects that span pages */
+#endif
+ char *vm_addr; /* address of kmap_atomic()'ed pages */
+ enum zs_mapmode vm_mm; /* mapping mode */
+};
+
+
+/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+
+static int is_first_page(struct page *page)
+{
+ return PagePrivate(page);
+}
+
+static int is_last_page(struct page *page)
+{
+ return PagePrivate2(page);
+}
+
+static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+ enum fullness_group *fullness)
+{
+ unsigned long m;
+ BUG_ON(!is_first_page(page));
+
+ m = (unsigned long)page->mapping;
+ *fullness = m & FULLNESS_MASK;
+ *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+}
+
+static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+ enum fullness_group fullness)
+{
+ unsigned long m;
+ BUG_ON(!is_first_page(page));
+
+ m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
+ (fullness & FULLNESS_MASK);
+ page->mapping = (struct address_space *)m;
+}
+
+/*
+ * zsmalloc divides the pool into various size classes where each
+ * class maintains a list of zspages where each zspage is divided
+ * into equal sized chunks. Each allocation falls into one of these
+ * classes depending on its size. This function returns the index of the
+ * size class which has a chunk size big enough to hold the given size.
+ */
+static int get_size_class_index(int size)
+{
+ int idx = 0;
+
+ if (likely(size > ZS_MIN_ALLOC_SIZE))
+ idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
+ ZS_SIZE_CLASS_DELTA);
+
+ return idx;
+}
+
+/*
+ * For each size class, zspages are divided into different groups
+ * depending on how "full" they are. This was done so that we could
+ * easily find empty or nearly empty zspages when we try to shrink
+ * the pool (not yet implemented). This function returns fullness
+ * status of the given page.
+ */
+static enum fullness_group get_fullness_group(struct page *page)
+{
+ int inuse, max_objects;
+ enum fullness_group fg;
+ BUG_ON(!is_first_page(page));
+
+ inuse = page->inuse;
+ max_objects = page->objects;
+
+ if (inuse == 0)
+ fg = ZS_EMPTY;
+ else if (inuse == max_objects)
+ fg = ZS_FULL;
+ else if (inuse <= max_objects / fullness_threshold_frac)
+ fg = ZS_ALMOST_EMPTY;
+ else
+ fg = ZS_ALMOST_FULL;
+
+ return fg;
+}
+
+/*
+ * Each size class maintains various freelists and zspages are assigned
+ * to one of these freelists based on the number of live objects they
+ * have. This function inserts the given zspage into the freelist
+ * identified by <class, fullness_group>.
+ */
+static void insert_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ if (*head)
+ list_add_tail(&page->lru, &(*head)->lru);
+
+ *head = page;
+}
+
+/*
+ * This function removes the given zspage from the freelist identified
+ * by <class, fullness_group>.
+ */
+static void remove_zspage(struct page *page, struct size_class *class,
+ enum fullness_group fullness)
+{
+ struct page **head;
+
+ BUG_ON(!is_first_page(page));
+
+ if (fullness >= _ZS_NR_FULLNESS_GROUPS)
+ return;
+
+ head = &class->fullness_list[fullness];
+ BUG_ON(!*head);
+ if (list_empty(&(*head)->lru))
+ *head = NULL;
+ else if (*head == page)
+ *head = (struct page *)list_entry((*head)->lru.next,
+ struct page, lru);
+
+ list_del_init(&page->lru);
+}
+
+/*
+ * Each size class maintains zspages in different fullness groups depending
+ * on the number of live objects they contain. When allocating or freeing
+ * objects, the fullness status of the page can change, say, from ALMOST_FULL
+ * to ALMOST_EMPTY when freeing an object. This function checks if such
+ * a status change has occurred for the given page and accordingly moves the
+ * page from the freelist of the old fullness group to that of the new
+ * fullness group.
+ */
+static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+ struct page *page)
+{
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group currfg, newfg;
+
+ BUG_ON(!is_first_page(page));
+
+ get_zspage_mapping(page, &class_idx, &currfg);
+ newfg = get_fullness_group(page);
+ if (newfg == currfg)
+ goto out;
+
+ class = &pool->size_class[class_idx];
+ remove_zspage(page, class, currfg);
+ insert_zspage(page, class, newfg);
+ set_zspage_mapping(page, class_idx, newfg);
+
+out:
+ return newfg;
+}
+
+/*
+ * We have to decide on how many pages to link together
+ * to form a zspage for each size class. This is important
+ * to reduce wastage due to unusable space left at end of
+ * each zspage which is given as:
+ * wastage = Zp % class_size
+ * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
+ *
+ * For example, for size class of 3/8 * PAGE_SIZE, we should
+ * link together 3 PAGE_SIZE sized pages to form a zspage
+ * since then we can perfectly fit in 8 such objects.
+ */
+static int get_pages_per_zspage(int class_size)
+{
+ int i, max_usedpc = 0;
+ /* zspage order which gives maximum used size per KB */
+ int max_usedpc_order = 1;
+
+ for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
+ int zspage_size;
+ int waste, usedpc;
+
+ zspage_size = i * PAGE_SIZE;
+ waste = zspage_size % class_size;
+ usedpc = (zspage_size - waste) * 100 / zspage_size;
+
+ if (usedpc > max_usedpc) {
+ max_usedpc = usedpc;
+ max_usedpc_order = i;
+ }
+ }
+
+ return max_usedpc_order;
+}
+
+/*
+ * A single 'zspage' is composed of many system pages which are
+ * linked together using fields in struct page. This function finds
+ * the first/head page, given any component page of a zspage.
+ */
+static struct page *get_first_page(struct page *page)
+{
+ if (is_first_page(page))
+ return page;
+ else
+ return page->first_page;
+}
+
+static struct page *get_next_page(struct page *page)
+{
+ struct page *next;
+
+ if (is_last_page(page))
+ next = NULL;
+ else if (is_first_page(page))
+ next = (struct page *)page_private(page);
+ else
+ next = list_entry(page->lru.next, struct page, lru);
+
+ return next;
+}
+
+/*
+ * Encode <page, obj_idx> as a single handle value.
+ * On hardware platforms with physical memory starting at 0x0 the pfn
+ * could be 0 so we ensure that the handle will never be 0 by adjusting the
+ * encoded obj_idx value before encoding.
+ */
+static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+{
+ unsigned long handle;
+
+ if (!page) {
+ BUG_ON(obj_idx);
+ return NULL;
+ }
+
+ handle = page_to_pfn(page) << OBJ_INDEX_BITS;
+ handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+
+ return (void *)handle;
+}
+
+/*
+ * Decode <page, obj_idx> pair from the given object handle. We adjust the
+ * decoded obj_idx back to its original value since it was adjusted in
+ * obj_location_to_handle().
+ */
+static void obj_handle_to_location(unsigned long handle, struct page **page,
+ unsigned long *obj_idx)
+{
+ *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
+ *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+}
+
+static unsigned long obj_idx_to_offset(struct page *page,
+ unsigned long obj_idx, int class_size)
+{
+ unsigned long off = 0;
+
+ if (!is_first_page(page))
+ off = page->index;
+
+ return off + obj_idx * class_size;
+}
+
+static void reset_page(struct page *page)
+{
+ clear_bit(PG_private, &page->flags);
+ clear_bit(PG_private_2, &page->flags);
+ set_page_private(page, 0);
+ page->mapping = NULL;
+ page->freelist = NULL;
+ page_mapcount_reset(page);
+}
+
+static void free_zspage(struct page *first_page)
+{
+ struct page *nextp, *tmp, *head_extra;
+
+ BUG_ON(!is_first_page(first_page));
+ BUG_ON(first_page->inuse);
+
+ head_extra = (struct page *)page_private(first_page);
+
+ reset_page(first_page);
+ __free_page(first_page);
+
+ /* zspage with only 1 system page */
+ if (!head_extra)
+ return;
+
+ list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
+ list_del(&nextp->lru);
+ reset_page(nextp);
+ __free_page(nextp);
+ }
+ reset_page(head_extra);
+ __free_page(head_extra);
+}
+
+/* Initialize a newly allocated zspage */
+static void init_zspage(struct page *first_page, struct size_class *class)
+{
+ unsigned long off = 0;
+ struct page *page = first_page;
+
+ BUG_ON(!is_first_page(first_page));
+ while (page) {
+ struct page *next_page;
+ struct link_free *link;
+ unsigned int i, objs_on_page;
+
+ /*
+ * page->index stores offset of first object starting
+ * in the page. For the first page, this is always 0,
+ * so we use first_page->index (aka ->freelist) to store
+ * head of corresponding zspage's freelist.
+ */
+ if (page != first_page)
+ page->index = off;
+
+ link = (struct link_free *)kmap_atomic(page) +
+ off / sizeof(*link);
+ objs_on_page = (PAGE_SIZE - off) / class->size;
+
+ for (i = 1; i <= objs_on_page; i++) {
+ off += class->size;
+ if (off < PAGE_SIZE) {
+ link->next = obj_location_to_handle(page, i);
+ link += class->size / sizeof(*link);
+ }
+ }
+
+ /*
+ * We now come to the last (full or partial) object on this
+ * page, which must point to the first object on the next
+ * page (if present)
+ */
+ next_page = get_next_page(page);
+ link->next = obj_location_to_handle(next_page, 0);
+ kunmap_atomic(link);
+ page = next_page;
+ off = (off + class->size) % PAGE_SIZE;
+ }
+}
+
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+{
+ int i, error;
+ struct page *first_page = NULL, *uninitialized_var(prev_page);
+
+ /*
+ * Allocate individual pages and link them together as:
+ * 1. first page->private = first sub-page
+ * 2. all sub-pages are linked together using page->lru
+ * 3. each sub-page is linked to the first page using page->first_page
+ *
+ * For each size class, First/Head pages are linked together using
+ * page->lru. Also, we set PG_private to identify the first page
+ * (i.e. no other sub-page has this flag set) and PG_private_2 to
+ * identify the last page.
+ */
+ error = -ENOMEM;
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
+
+ page = alloc_page(flags);
+ if (!page)
+ goto cleanup;
+
+ INIT_LIST_HEAD(&page->lru);
+ if (i == 0) { /* first page */
+ SetPagePrivate(page);
+ set_page_private(page, 0);
+ first_page = page;
+ first_page->inuse = 0;
+ }
+ if (i == 1)
+ set_page_private(first_page, (unsigned long)page);
+ if (i >= 1)
+ page->first_page = first_page;
+ if (i >= 2)
+ list_add(&page->lru, &prev_page->lru);
+ if (i == class->pages_per_zspage - 1) /* last page */
+ SetPagePrivate2(page);
+ prev_page = page;
+ }
+
+ init_zspage(first_page, class);
+
+ first_page->freelist = obj_location_to_handle(first_page, 0);
+ /* Maximum number of objects we can store in this zspage */
+ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+
+ error = 0; /* Success */
+
+cleanup:
+ if (unlikely(error) && first_page) {
+ free_zspage(first_page);
+ first_page = NULL;
+ }
+
+ return first_page;
+}
+
+static struct page *find_get_zspage(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page)
+ break;
+ }
+
+ return page;
+}
+
+#ifdef CONFIG_PGTABLE_MAPPING
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+ /*
+ * Make sure we don't leak memory if a cpu UP notification
+ * and zs_init() race and both call zs_cpu_up() on the same cpu
+ */
+ if (area->vm)
+ return 0;
+ area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+ if (!area->vm)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm)
+ free_vm_area(area->vm);
+ area->vm = NULL;
+}
+
+static inline void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
+ area->vm_addr = area->vm->addr;
+ return area->vm_addr + off;
+}
+
+static inline void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ unsigned long addr = (unsigned long)area->vm_addr;
+
+ unmap_kernel_range(addr, PAGE_SIZE * 2);
+}
+
+#else /* CONFIG_PGTABLE_MAPPING */
+
+static inline int __zs_cpu_up(struct mapping_area *area)
+{
+ /*
+ * Make sure we don't leak memory if a cpu UP notification
+ * and zs_init() race and both call zs_cpu_up() on the same cpu
+ */
+ if (area->vm_buf)
+ return 0;
+ area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+ if (!area->vm_buf)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline void __zs_cpu_down(struct mapping_area *area)
+{
+ if (area->vm_buf)
+ free_page((unsigned long)area->vm_buf);
+ area->vm_buf = NULL;
+}
+
+static void *__zs_map_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* disable page faults to match kmap_atomic() return conditions */
+ pagefault_disable();
+
+ /* no read fastpath */
+ if (area->vm_mm == ZS_MM_WO)
+ goto out;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy object to per-cpu buffer */
+ addr = kmap_atomic(pages[0]);
+ memcpy(buf, addr + off, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(buf + sizes[0], addr, sizes[1]);
+ kunmap_atomic(addr);
+out:
+ return area->vm_buf;
+}
+
+static void __zs_unmap_object(struct mapping_area *area,
+ struct page *pages[2], int off, int size)
+{
+ int sizes[2];
+ void *addr;
+ char *buf = area->vm_buf;
+
+ /* no write fastpath */
+ if (area->vm_mm == ZS_MM_RO)
+ goto out;
+
+ sizes[0] = PAGE_SIZE - off;
+ sizes[1] = size - sizes[0];
+
+ /* copy per-cpu buffer to object */
+ addr = kmap_atomic(pages[0]);
+ memcpy(addr + off, buf, sizes[0]);
+ kunmap_atomic(addr);
+ addr = kmap_atomic(pages[1]);
+ memcpy(addr, buf + sizes[0], sizes[1]);
+ kunmap_atomic(addr);
+
+out:
+ /* enable page faults to match kunmap_atomic() return conditions */
+ pagefault_enable();
+}
+
+#endif /* CONFIG_PGTABLE_MAPPING */
+
+static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
+ void *pcpu)
+{
+ int ret, cpu = (long)pcpu;
+ struct mapping_area *area;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ area = &per_cpu(zs_map_area, cpu);
+ ret = __zs_cpu_up(area);
+ if (ret)
+ return notifier_from_errno(ret);
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ area = &per_cpu(zs_map_area, cpu);
+ __zs_cpu_down(area);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block zs_cpu_nb = {
+ .notifier_call = zs_cpu_notifier
+};
+
+static void zs_exit(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
+ unregister_cpu_notifier(&zs_cpu_nb);
+}
+
+static int zs_init(void)
+{
+ int cpu, ret;
+
+ register_cpu_notifier(&zs_cpu_nb);
+ for_each_online_cpu(cpu) {
+ ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+ if (notifier_to_errno(ret))
+ goto fail;
+ }
+ return 0;
+fail:
+ zs_exit();
+ return notifier_to_errno(ret);
+}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(gfp_t flags)
+{
+ int i, ovhd_size;
+ struct zs_pool *pool;
+
+ ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+ pool = kzalloc(ovhd_size, GFP_KERNEL);
+ if (!pool)
+ return NULL;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int size;
+ struct size_class *class;
+
+ size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+ if (size > ZS_MAX_ALLOC_SIZE)
+ size = ZS_MAX_ALLOC_SIZE;
+
+ class = &pool->size_class[i];
+ class->size = size;
+ class->index = i;
+ spin_lock_init(&class->lock);
+ class->pages_per_zspage = get_pages_per_zspage(size);
+
+ }
+
+ pool->flags = flags;
+
+ return pool;
+}
+EXPORT_SYMBOL_GPL(zs_create_pool);
+
+void zs_destroy_pool(struct zs_pool *pool)
+{
+ int i;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+ int fg;
+ struct size_class *class = &pool->size_class[i];
+
+ for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+ if (class->fullness_list[fg]) {
+ pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+ class->size, fg);
+ }
+ }
+ }
+ kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+/**
+ * zs_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ *
+ * On success, handle to the allocated object is returned,
+ * otherwise 0.
+ * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
+ */
+unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+{
+ unsigned long obj;
+ struct link_free *link;
+ int class_idx;
+ struct size_class *class;
+
+ struct page *first_page, *m_page;
+ unsigned long m_objidx, m_offset;
+
+ if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+ return 0;
+
+ class_idx = get_size_class_index(size);
+ class = &pool->size_class[class_idx];
+ BUG_ON(class_idx != class->index);
+
+ spin_lock(&class->lock);
+ first_page = find_get_zspage(class);
+
+ if (!first_page) {
+ spin_unlock(&class->lock);
+ first_page = alloc_zspage(class, pool->flags);
+ if (unlikely(!first_page))
+ return 0;
+
+ set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+ spin_lock(&class->lock);
+ class->pages_allocated += class->pages_per_zspage;
+ }
+
+ obj = (unsigned long)first_page->freelist;
+ obj_handle_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ link = (struct link_free *)kmap_atomic(m_page) +
+ m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ memset(link, POISON_INUSE, sizeof(*link));
+ kunmap_atomic(link);
+
+ first_page->inuse++;
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(pool, first_page);
+ spin_unlock(&class->lock);
+
+ return obj;
+}
+EXPORT_SYMBOL_GPL(zs_malloc);
+
+void zs_free(struct zs_pool *pool, unsigned long obj)
+{
+ struct link_free *link;
+ struct page *first_page, *f_page;
+ unsigned long f_objidx, f_offset;
+
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!obj))
+ return;
+
+ obj_handle_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = &pool->size_class[class_idx];
+ f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+
+ spin_lock(&class->lock);
+
+ /* Insert this object in containing zspage's freelist */
+ link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
+ + f_offset);
+ link->next = first_page->freelist;
+ kunmap_atomic(link);
+ first_page->freelist = (void *)obj;
+
+ first_page->inuse--;
+ fullness = fix_fullness_group(pool, first_page);
+
+ if (fullness == ZS_EMPTY)
+ class->pages_allocated -= class->pages_per_zspage;
+
+ spin_unlock(&class->lock);
+
+ if (fullness == ZS_EMPTY)
+ free_zspage(first_page);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+/**
+ * zs_map_object - get address of allocated object from handle.
+ * @pool: pool from which the object was allocated
+ * @handle: handle returned from zs_malloc
+ *
+ * Before using an object allocated from zs_malloc, it must be mapped using
+ * this function. When done with the object, it must be unmapped using
+ * zs_unmap_object.
+ *
+ * Only one object can be mapped per cpu at a time. There is no protection
+ * against nested mappings.
+ *
+ * This function returns with preemption and page faults disabled.
+ */
+void *zs_map_object(struct zs_pool *pool, unsigned long handle,
+ enum zs_mapmode mm)
+{
+ struct page *page;
+ unsigned long obj_idx, off;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+ struct page *pages[2];
+
+ BUG_ON(!handle);
+
+ /*
+ * Because we use per-cpu mapping areas shared among the
+ * pools/users, we can't allow mapping in interrupt context
+ * because it can corrupt another user's mappings.
+ */
+ BUG_ON(in_interrupt());
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = &get_cpu_var(zs_map_area);
+ area->vm_mm = mm;
+ if (off + class->size <= PAGE_SIZE) {
+ /* this object is contained entirely within a page */
+ area->vm_addr = kmap_atomic(page);
+ return area->vm_addr + off;
+ }
+
+ /* this object spans two pages */
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ return __zs_map_object(area, pages, off, class->size);
+}
+EXPORT_SYMBOL_GPL(zs_map_object);
+
+void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *page;
+ unsigned long obj_idx, off;
+
+ unsigned int class_idx;
+ enum fullness_group fg;
+ struct size_class *class;
+ struct mapping_area *area;
+
+ BUG_ON(!handle);
+
+ obj_handle_to_location(handle, &page, &obj_idx);
+ get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ class = &pool->size_class[class_idx];
+ off = obj_idx_to_offset(page, obj_idx, class->size);
+
+ area = &__get_cpu_var(zs_map_area);
+ if (off + class->size <= PAGE_SIZE)
+ kunmap_atomic(area->vm_addr);
+ else {
+ struct page *pages[2];
+
+ pages[0] = page;
+ pages[1] = get_next_page(page);
+ BUG_ON(!pages[1]);
+
+ __zs_unmap_object(area, pages, off, class->size);
+ }
+ put_cpu_var(zs_map_area);
+}
+EXPORT_SYMBOL_GPL(zs_unmap_object);
+
+u64 zs_get_total_size_bytes(struct zs_pool *pool)
+{
+ int i;
+ u64 npages = 0;
+
+ for (i = 0; i < ZS_SIZE_CLASSES; i++)
+ npages += pool->size_class[i].pages_allocated;
+
+ return npages << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
+
+module_init(zs_init);
+module_exit(zs_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
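
Finally, a stand-alone illustration of the handle packing done by obj_location_to_handle()/obj_handle_to_location() in the new mm/zsmalloc.c, assuming 64-bit longs, 4K pages and the default MAX_PHYSMEM_BITS == BITS_PER_LONG (so, per the comment in the file, OBJ_INDEX_BITS == PAGE_SHIFT == 12):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define OBJ_INDEX_BITS	PAGE_SHIFT
	#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

	static unsigned long encode(unsigned long pfn, unsigned long obj_idx)
	{
		/* obj_idx is stored off-by-one so a valid handle is never 0 */
		return (pfn << OBJ_INDEX_BITS) | ((obj_idx + 1) & OBJ_INDEX_MASK);
	}

	static void decode(unsigned long handle, unsigned long *pfn,
			   unsigned long *obj_idx)
	{
		*pfn = handle >> OBJ_INDEX_BITS;
		*obj_idx = (handle & OBJ_INDEX_MASK) - 1;
	}

	int main(void)
	{
		unsigned long pfn, idx;

		decode(encode(0x1234, 7), &pfn, &idx);
		printf("pfn=%#lx idx=%lu\n", pfn, idx);	/* pfn=0x1234 idx=7 */
		return 0;
	}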