From 6bbdd563ee9a6078725727571586c66c8613db64 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 3 Mar 2020 14:58:21 -0500
Subject: dax: Create a range version of dax_layout_busy_page()

virtiofs device has a range of memory which is mapped into file inodes
using dax. This memory is mapped in qemu on host and maps different
sections of real file on host. Size of this memory is limited
(determined by administrator) and depending on filesystem size, we will
soon reach a situation where all the memory is in use and we need to
reclaim some.

As part of reclaim process, we will need to make sure that there are
no active references to pages (taken by get_user_pages()) on the memory
range we are trying to reclaim. I am planning to use
dax_layout_busy_page() for this. But in current form this is per inode
and scans through all the pages of the inode.

We want to reclaim only a portion of memory (say 2MB page). So we want
to make sure that only that 2MB range of pages does not have any
references (and don't want to unmap all the pages of inode).

Hence, create a range version of this function named
dax_layout_busy_page_range() which can be used to pass a range which
needs to be unmapped.

Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-nvdimm@lists.01.org
Cc: Jan Kara <jack@suse.cz>
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Cc: "Weiny, Ira" <ira.weiny@intel.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/linux/dax.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 6904d4e0b2e0..9016929db4c6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -141,6 +141,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc);
 
 struct page *dax_layout_busy_page(struct address_space *mapping);
+struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
 #else
@@ -171,6 +172,11 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping)
 	return NULL;
 }
 
+static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end)
+{
+	return NULL;	/* stub variant: there is never a busy page to report */
+}
+
 static inline int dax_writeback_mapping_range(struct address_space *mapping,
 		struct dax_device *dax_dev, struct writeback_control *wbc)
 {
-- 
cgit v1.2.3


From 22f3787e9d95e72d1f09795f294fb010e2998f43 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 19 Aug 2020 18:19:46 -0400
Subject: virtiofs: set up virtio_fs dax_device

Setup a dax device.

Use the shm capability to find the cache entry and map it.

The DAX window is accessed by the fs/dax.c infrastructure and must have
struct pages (at least on x86).  Use devm_memremap_pages() to map the
DAX window PCI BAR and allocate struct page.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Sebastien Boeuf <sebastien.boeuf@intel.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/virtio_fs.c            | 138 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/virtio_fs.h |   3 +
 2 files changed, 141 insertions(+)

(limited to 'include')

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 47ecdc15f25d..f31a59f74475 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -5,12 +5,16 @@
  */
 
 #include <linux/fs.h>
+#include <linux/dax.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_fs.h>
 #include <linux/delay.h>
 #include <linux/fs_context.h>
 #include <linux/highmem.h>
+#include <linux/uio.h>
 #include "fuse_i.h"
 
 /* List of virtio-fs device instances and a lock for the list. Also provides
@@ -49,6 +53,12 @@ struct virtio_fs {
 	struct virtio_fs_vq *vqs;
 	unsigned int nvqs;               /* number of virtqueues */
 	unsigned int num_request_queues; /* number of request queues */
+	struct dax_device *dax_dev;
+
+	/* DAX memory window where file contents are mapped */
+	void *window_kaddr;
+	phys_addr_t window_phys_addr;
+	size_t window_len;
 };
 
 struct virtio_fs_forget_req {
@@ -686,6 +696,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
 	vdev->config->del_vqs(vdev);
 }
 
+/* Map a window offset to a page frame number.  The window offset will have
+ * been produced by .iomap_begin(), which maps a file offset to a window
+ * offset.
+ */
+static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
+				    long nr_pages, void **kaddr, pfn_t *pfn)
+{
+	struct virtio_fs *fs = dax_get_private(dax_dev);
+	phys_addr_t offset = PFN_PHYS(pgoff);
+	size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
+
+	if (kaddr)
+		*kaddr = fs->window_kaddr + offset;
+	if (pfn)
+		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
+					PFN_DEV | PFN_MAP);
+	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
+}
+
+static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev,
+				       pgoff_t pgoff, void *addr,
+				       size_t bytes, struct iov_iter *i)
+{
+	return copy_from_iter(addr, bytes, i);
+}
+
+static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev,
+				       pgoff_t pgoff, void *addr,
+				       size_t bytes, struct iov_iter *i)
+{
+	return copy_to_iter(addr, bytes, i);
+}
+
+static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
+				     pgoff_t pgoff, size_t nr_pages)
+{
+	long rc;
+	void *kaddr;
+
+	rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL);
+	if (rc < 0)
+		return rc;
+	memset(kaddr, 0, nr_pages << PAGE_SHIFT);
+	dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
+	return 0;
+}
+
+static const struct dax_operations virtio_fs_dax_ops = {
+	.direct_access = virtio_fs_direct_access,
+	.copy_from_iter = virtio_fs_copy_from_iter,
+	.copy_to_iter = virtio_fs_copy_to_iter,
+	.zero_page_range = virtio_fs_zero_page_range,
+};
+
+static void virtio_fs_cleanup_dax(void *data)
+{
+	struct dax_device *dax_dev = data;
+
+	kill_dax(dax_dev);
+	put_dax(dax_dev);
+}
+
+static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	struct virtio_shm_region cache_reg;
+	struct dev_pagemap *pgmap;
+	bool have_cache;
+
+	if (!IS_ENABLED(CONFIG_FUSE_DAX))
+		return 0;
+
+	/* Get cache region */
+	have_cache = virtio_get_shm_region(vdev, &cache_reg,
+					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
+	if (!have_cache) {
+		dev_notice(&vdev->dev, "%s: No cache capability\n", __func__);
+		return 0;
+	}
+
+	if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len,
+				     dev_name(&vdev->dev))) {
+		dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n",
+			 cache_reg.addr, cache_reg.len);
+		return -EBUSY;
+	}
+
+	dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len,
+		   cache_reg.addr);
+
+	pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL);
+	if (!pgmap)
+		return -ENOMEM;
+
+	pgmap->type = MEMORY_DEVICE_FS_DAX;
+
+	/* Ideally we would directly use the PCI BAR resource but
+	 * devm_memremap_pages() wants its own copy in pgmap.  So
+	 * initialize a struct resource from scratch (only the start
+	 * and end fields will be used).
+	 */
+	pgmap->res = (struct resource){
+		.name = "virtio-fs dax window",
+		.start = (phys_addr_t) cache_reg.addr,
+		.end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1,
+	};
+
+	fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap);
+	if (IS_ERR(fs->window_kaddr))
+		return PTR_ERR(fs->window_kaddr);
+
+	fs->window_phys_addr = (phys_addr_t) cache_reg.addr;
+	fs->window_len = (phys_addr_t) cache_reg.len;
+
+	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
+		__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
+
+	fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+	if (IS_ERR(fs->dax_dev))
+		return PTR_ERR(fs->dax_dev);
+
+	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
+					fs->dax_dev);
+}
+
 static int virtio_fs_probe(struct virtio_device *vdev)
 {
 	struct virtio_fs *fs;
@@ -707,6 +841,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 
 	/* TODO vq affinity */
 
+	ret = virtio_fs_setup_dax(vdev, fs);
+	if (ret < 0)
+		goto out_vqs;
+
 	/* Bring the device online in case the filesystem is mounted and
 	 * requests need to be sent before we return.
 	 */
diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h
index 3056b6e9f8ce..bea38291421b 100644
--- a/include/uapi/linux/virtio_fs.h
+++ b/include/uapi/linux/virtio_fs.h
@@ -16,4 +16,7 @@ struct virtio_fs_config {
 	__le32 num_request_queues;
 } __attribute__((packed));
 
+/* For the id field in virtio_pci_shm_cap */
+#define VIRTIO_FS_SHMCAP_ID_CACHE 0
+
 #endif /* _UAPI_LINUX_VIRTIO_FS_H */
-- 
cgit v1.2.3


From fd1a1dc6f5aa7361e3562790336e116935f8fcfa Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 19 Aug 2020 18:19:49 -0400
Subject: virtiofs: implement FUSE_INIT map_alignment field

The device communicates FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING alignment
constraints via the FUSE_INIT map_alignment field.  Parse this field and
ensure our DAX mappings meet the alignment constraints.

We don't actually align anything differently since our mappings are
already 2MB aligned.  Just check the value when the connection is
established.  If it becomes necessary to honor arbitrary alignments in
the future we'll have to adjust how mappings are sized.

The upshot of this commit is that we can be confident that mappings will
work even when emulating x86 on Power and similar combinations where the
host page sizes are different.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/dax.c             | 15 ++++++++++++++-
 fs/fuse/fuse_i.h          |  1 +
 fs/fuse/inode.c           | 17 ++++++++++++++++-
 include/uapi/linux/fuse.h |  4 +++-
 4 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 031106020f75..fec8a2bd75b3 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -9,7 +9,10 @@
 #include <linux/dax.h>
 #include <linux/pfn_t.h>
 
-/* Default memory range size, 2MB */
+/*
+ * Default memory range size.  A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
 #define FUSE_DAX_SHIFT	21
 #define FUSE_DAX_SZ	(1 << FUSE_DAX_SHIFT)
 #define FUSE_DAX_PAGES	(FUSE_DAX_SZ / PAGE_SIZE)
@@ -123,3 +126,13 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
 	fc->dax = fcd;
 	return 0;
 }
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+	if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+		pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+			map_alignment, FUSE_DAX_SZ);
+		return false;
+	}
+	return true;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 97af7952373a..2f3f04aa64c7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1106,5 +1106,6 @@ void fuse_free_conn(struct fuse_conn *fc);
 
 int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
 void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1780dfe063ab..67e99cee5a4f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -908,9 +908,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
 {
 	struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
 	struct fuse_init_out *arg = &ia->out;
+	bool ok = true;
 
 	if (error || arg->major != FUSE_KERNEL_VERSION)
-		fc->conn_error = 1;
+		ok = false;
 	else {
 		unsigned long ra_pages;
 
@@ -973,6 +974,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
 					min_t(unsigned int, FUSE_MAX_MAX_PAGES,
 					max_t(unsigned int, arg->max_pages, 1));
 			}
+			if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+			    arg->flags & FUSE_MAP_ALIGNMENT &&
+			    !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+				ok = false;
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -988,6 +994,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
 	}
 	kfree(ia);
 
+	if (!ok) {
+		fc->conn_init = 0;
+		fc->conn_error = 1;
+	}
+
 	fuse_set_initialized(fc);
 	wake_up_all(&fc->blocked_waitq);
 }
@@ -1011,6 +1022,10 @@ void fuse_send_init(struct fuse_conn *fc)
 		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
 		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+	if (fc->dax)
+		ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
 	ia->args.opcode = FUSE_INIT;
 	ia->args.in_numargs = 1;
 	ia->args.in_args[0].size = sizeof(ia->in);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 373cada89815..5b85819e045f 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -313,7 +313,9 @@ struct fuse_file_lock {
  * FUSE_CACHE_SYMLINKS: cache READLINK responses
  * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
  * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
- * FUSE_MAP_ALIGNMENT: map_alignment field is valid
+ * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for
+ *		       foffset and moffset fields in struct
+ *		       fuse_setupmapping_out and fuse_removemapping_one.
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
-- 
cgit v1.2.3


From ceec02d4354a317cacce4b053a580ea3c7fc6cdc Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 19 Aug 2020 18:19:50 -0400
Subject: virtiofs: introduce setupmapping/removemapping commands

Introduce two new fuse commands to setup/remove memory mappings. This
will be used to setup/tear down file mapping in dax window.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/uapi/linux/fuse.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 5b85819e045f..60a7bfc787ce 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -894,4 +894,33 @@ struct fuse_copy_file_range_in {
 	uint64_t	flags;
 };
 
+#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+struct fuse_setupmapping_in {
+	/* An already open handle */
+	uint64_t	fh;
+	/* Offset into the file to start the mapping */
+	uint64_t	foffset;
+	/* Length of mapping required */
+	uint64_t	len;
+	/* Flags, FUSE_SETUPMAPPING_FLAG_* */
+	uint64_t	flags;
+	/* Offset in Memory Window */
+	uint64_t	moffset;
+};
+
+struct fuse_removemapping_in {
+	/* Number of fuse_removemapping_one entries that follow */
+	uint32_t        count;
+};
+
+struct fuse_removemapping_one {
+	/* Offset into the dax window at which to start the unmapping */
+	uint64_t        moffset;
+	/* Length of the mapping to remove */
+	uint64_t	len;
+};
+
+#define FUSE_REMOVEMAPPING_MAX_ENTRY   \
+		(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
+
 #endif /* _LINUX_FUSE_H */
-- 
cgit v1.2.3


From c2d0ad00d948de73c78f05d2b3e5bdfa605035cc Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 19 Aug 2020 18:19:51 -0400
Subject: virtiofs: implement dax read/write operations

This patch implements basic DAX support. mmap() is not implemented
yet and will come in later patches. This patch looks into implementing
read/write.

We make use of interval tree to keep track of per inode dax mappings.

Do not use dax for file extending writes, instead just send WRITE message
to daemon (like we do for direct I/O path). This will keep write and
i_size change atomic w.r.t crash.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Peng Tao <tao.peng@linux.alibaba.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 fs/fuse/Kconfig           |   1 +
 fs/fuse/dax.c             | 565 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/file.c            |  15 +-
 fs/fuse/fuse_i.h          |  15 ++
 fs/fuse/inode.c           |  21 +-
 include/uapi/linux/fuse.h |   1 +
 6 files changed, 612 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index fddd40630077..40ce9a1c12e5 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -42,6 +42,7 @@ config VIRTIO_FS
 config FUSE_DAX
 	bool "Virtio Filesystem Direct Host Memory Access support"
 	default y
+	select INTERVAL_TREE
 	depends on VIRTIO_FS
 	depends on FS_DAX
 	depends on DAX_DRIVER
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index fec8a2bd75b3..a8d311b2db8e 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -7,7 +7,10 @@
 #include "fuse_i.h"
 
 #include <linux/dax.h>
+#include <linux/uio.h>
 #include <linux/pfn_t.h>
+#include <linux/iomap.h>
+#include <linux/interval_tree.h>
 
 /*
  * Default memory range size.  A power of 2 so it agrees with common FUSE_INIT
@@ -22,22 +25,556 @@ struct fuse_dax_mapping {
 	/* Will connect in fcd->free_ranges to keep track of free memory */
 	struct list_head list;
 
+	/* For interval tree in file/inode */
+	struct interval_tree_node itn;
+
 	/** Position in DAX window */
 	u64 window_offset;
 
 	/** Length of mapping, in bytes */
 	loff_t length;
+
+	/* Is this mapping read-only or read-write */
+	bool writable;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+	/* Semaphore to protect modifications to the dmap tree */
+	struct rw_semaphore sem;
+
+	/* Sorted rb tree of struct fuse_dax_mapping elements */
+	struct rb_root_cached tree;
+	unsigned long nr;
 };
 
 struct fuse_conn_dax {
 	/* DAX device */
 	struct dax_device *dev;
 
+	/* Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
 	/* DAX Window Free Ranges */
 	long nr_free_ranges;
 	struct list_head free_ranges;
 };
 
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+	if (!node)
+		return NULL;
+
+	return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+	struct fuse_dax_mapping *dmap;
+
+	spin_lock(&fcd->lock);
+	dmap = list_first_entry_or_null(&fcd->free_ranges,
+					struct fuse_dax_mapping, list);
+	if (dmap) {
+		list_del_init(&dmap->list);
+		WARN_ON(fcd->nr_free_ranges <= 0);
+		fcd->nr_free_ranges--;
+	}
+	spin_unlock(&fcd->lock);
+	return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+				struct fuse_dax_mapping *dmap)
+{
+	list_add_tail(&dmap->list, &fcd->free_ranges);
+	fcd->nr_free_ranges++;
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+				struct fuse_dax_mapping *dmap)
+{
+	/* Return fuse_dax_mapping to free list */
+	spin_lock(&fcd->lock);
+	__dmap_add_to_free_pool(fcd, dmap);
+	spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+				  struct fuse_dax_mapping *dmap, bool writable,
+				  bool upgrade)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_conn_dax *fcd = fc->dax;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_setupmapping_in inarg;
+	loff_t offset = start_idx << FUSE_DAX_SHIFT;
+	FUSE_ARGS(args);
+	ssize_t err;
+
+	WARN_ON(fcd->nr_free_ranges < 0);
+
+	/* Ask fuse daemon to setup mapping */
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.foffset = offset;
+	inarg.fh = -1;
+	inarg.moffset = dmap->window_offset;
+	inarg.len = FUSE_DAX_SZ;
+	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+	if (writable)
+		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+	args.opcode = FUSE_SETUPMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err < 0)
+		return err;
+	dmap->writable = writable;
+	if (!upgrade) {
+		dmap->itn.start = dmap->itn.last = start_idx;
+		/* Protected by fi->dax->sem */
+		interval_tree_insert(&dmap->itn, &fi->dax->tree);
+		fi->dax->nr++;
+	}
+	return 0;
+}
+
+static int fuse_send_removemapping(struct inode *inode,
+				   struct fuse_removemapping_in *inargp,
+				   struct fuse_removemapping_one *remove_one)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+
+	args.opcode = FUSE_REMOVEMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(*inargp);
+	args.in_args[0].value = inargp;
+	args.in_args[1].size = inargp->count * sizeof(*remove_one);
+	args.in_args[1].value = remove_one;
+	return fuse_simple_request(fc, &args);
+}
+
+static int dmap_removemapping_list(struct inode *inode, unsigned int num,
+				   struct list_head *to_remove)
+{
+	struct fuse_removemapping_one *remove_one, *ptr;
+	struct fuse_removemapping_in inarg;
+	struct fuse_dax_mapping *dmap;
+	int ret = 0, i = 0, nr_alloc;	/* ret = 0: no request sent is not an error */
+
+	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
+	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
+	if (!remove_one)
+		return -ENOMEM;
+
+	ptr = remove_one;
+	list_for_each_entry(dmap, to_remove, list) {
+		ptr->moffset = dmap->window_offset;
+		ptr->len = dmap->length;
+		ptr++;
+		i++;
+		num--;
+		if (i >= nr_alloc || num == 0) {	/* batch full or last entry */
+			memset(&inarg, 0, sizeof(inarg));
+			inarg.count = i;
+			ret = fuse_send_removemapping(inode, &inarg,
+						      remove_one);
+			if (ret)
+				goto out;
+			ptr = remove_one;	/* reuse the buffer for the next batch */
+			i = 0;
+		}
+	}
+out:
+	kfree(remove_one);
+	return ret;
+}
+
+/*
+ * Cleanup dmap entry and add back to free list. This should be called with
+ * fcd->lock held.
+ */
+static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
+					    struct fuse_dax_mapping *dmap)
+{
+	pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
+		 dmap->itn.start, dmap->itn.last, dmap->window_offset,
+		 dmap->length);
+	dmap->itn.start = dmap->itn.last = 0;
+	__dmap_add_to_free_pool(fcd, dmap);
+}
+
+/*
+ * Free inode dmap entries whose range falls inside [start, end].
+ * Does not take any locks. At this point of time it should only be
+ * called from evict_inode() path where we know all dmap entries can be
+ * reclaimed.
+ */
+static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
+				     struct inode *inode,
+				     loff_t start, loff_t end)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap, *n;
+	int err, num = 0;
+	LIST_HEAD(to_remove);
+	unsigned long start_idx = start >> FUSE_DAX_SHIFT;
+	unsigned long end_idx = end >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	while (1) {
+		node = interval_tree_iter_first(&fi->dax->tree, start_idx,
+						end_idx);
+		if (!node)
+			break;
+		dmap = node_to_dmap(node);
+		interval_tree_remove(&dmap->itn, &fi->dax->tree);
+		num++;
+		list_add(&dmap->list, &to_remove);
+	}
+
+	/* Nothing to remove */
+	if (list_empty(&to_remove))
+		return;
+
+	WARN_ON(fi->dax->nr < num);
+	fi->dax->nr -= num;
+	err = dmap_removemapping_list(inode, num, &to_remove);
+	if (err && err != -ENOTCONN) {
+		pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
+			start, end);
+	}
+	spin_lock(&fcd->lock);
+	list_for_each_entry_safe(dmap, n, &to_remove, list) {
+		list_del_init(&dmap->list);
+		dmap_reinit_add_to_free_pool(fcd, dmap);
+	}
+	spin_unlock(&fcd->lock);
+}
+
+/*
+ * It is called from evict_inode() and by that time inode is going away. So
+ * this function does not take any locks like fi->dax->sem for traversing
+ * that fuse inode interval tree. If that lock is taken then lock validator
+ * complains of deadlock situation w.r.t fs_reclaim lock.
+ */
+void fuse_dax_inode_cleanup(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	/*
+	 * fuse_evict_inode() has already called truncate_inode_pages_final()
+	 * before we arrive here. So we should not have to worry about any
+	 * pages/exception entries still associated with inode.
+	 */
+	inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+	WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->length = length;
+	iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+			    struct iomap *iomap, struct fuse_dax_mapping *dmap,
+			    unsigned int flags)
+{
+	loff_t offset, len;
+	loff_t i_size = i_size_read(inode);
+
+	offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+	len = min(length, dmap->length - offset);
+
+	/* If length is beyond end of file, truncate further */
+	if (pos + len > i_size)
+		len = i_size - pos;
+
+	if (len > 0) {
+		iomap->addr = dmap->window_offset + offset;
+		iomap->length = len;
+		if (flags & IOMAP_FAULT)
+			iomap->length = ALIGN(len, PAGE_SIZE);
+		iomap->type = IOMAP_MAPPED;
+	} else {
+		/* Mapping beyond end of file is hole */
+		fuse_fill_iomap_hole(iomap, length);
+	}
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+				      loff_t length, unsigned int flags,
+				      struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_conn_dax *fcd = fc->dax;
+	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+	int ret;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	alloc_dmap = alloc_dax_mapping(fcd);
+	if (!alloc_dmap)
+		return -EIO;
+
+	/*
+	 * Take write lock so that only one caller can try to setup mapping
+	 * and other waits.
+	 */
+	down_write(&fi->dax->sem);
+	/*
+	 * We dropped lock. Check again if somebody else setup
+	 * mapping already.
+	 */
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return 0;
+	}
+
+	/* Setup one mapping */
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+				     writable, false);
+	if (ret < 0) {
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return ret;
+	}
+	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+	up_write(&fi->dax->sem);
+	return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+				    loff_t length, unsigned int flags,
+				    struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+	unsigned long idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/*
+	 * Take exclusive lock so that only one caller can try to setup
+	 * mapping and others wait.
+	 */
+	down_write(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+	/* We are holding either inode lock or i_mmap_sem, and that should
+	 * ensure that dmap can't be reclaimed or truncated and it should still
+	 * be there in tree despite the fact we dropped and re-acquired the
+	 * lock.
+	 */
+	ret = -EIO;
+	if (WARN_ON(!node))
+		goto out_err;
+
+	dmap = node_to_dmap(node);
+
+	/* Maybe another thread already upgraded mapping while we were not
+	 * holding lock.
+	 */
+	if (dmap->writable) {
+		ret = 0;
+		goto out_fill_iomap;
+	}
+
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+				     true);
+	if (ret < 0)
+		goto out_err;
+out_fill_iomap:
+	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+	up_write(&fi->dax->sem);
+	return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+			    unsigned int flags, struct iomap *iomap,
+			    struct iomap *srcmap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_dax_mapping *dmap;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/* We don't support FIEMAP */
+	if (WARN_ON(flags & IOMAP_REPORT))
+		return -EIO;
+
+	iomap->offset = pos;
+	iomap->flags = 0;
+	iomap->bdev = NULL;
+	iomap->dax_dev = fc->dax->dev;
+
+	/*
+	 * Both read/write and mmap path can race here. So we need something
+	 * to make sure if we are setting up mapping, then other path waits
+	 *
+	 * For now, use a semaphore for this. It probably needs to be
+	 * optimized later.
+	 */
+	down_read(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		if (writable && !dmap->writable) {
+			/* Upgrade read-only mapping to read-write. This will
+			 * require exclusive fi->dax->sem lock as we don't want
+			 * two threads to be trying to do this simultaneously
+			 * for same dmap. So drop shared lock and acquire
+			 * exclusive lock.
+			 */
+			up_read(&fi->dax->sem);
+			pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+				 __func__, pos, length);
+			return fuse_upgrade_dax_mapping(inode, pos, length,
+							flags, iomap);
+		} else {
+			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+			up_read(&fi->dax->sem);
+			return 0;
+		}
+	} else {
+		up_read(&fi->dax->sem);
+		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+				__func__, pos, length);
+		if (pos >= i_size_read(inode))
+			goto iomap_hole;
+
+		return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+						  iomap);
+	}
+
+	/*
+	 * If read beyond end of file happens, fs code seems to return
+	 * it as hole
+	 */
+iomap_hole:
+	fuse_fill_iomap_hole(iomap, length);
+	pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+		 __func__, pos, length, iomap->length);
+	return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned int flags,
+			  struct iomap *iomap)
+{
+	/* DAX writes beyond end-of-file aren't handled using iomap, so the
+	 * file size is unchanged and there is nothing to do here.
+	 */
+	return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+	.iomap_begin = fuse_iomap_begin,
+	.iomap_end = fuse_iomap_end,
+};
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock_shared(inode);
+	}
+
+	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
+	inode_unlock_shared(inode);
+
+	/* TODO file_accessed(iocb->ki_filp) */
+	return ret;
+}
+
+static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return (iov_iter_rw(from) == WRITE &&
+		((iocb->ki_pos) >= i_size_read(inode) ||
+		  (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
+}
+
+static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+	ssize_t ret;
+
+	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+	if (ret < 0)
+		return ret;
+
+	fuse_invalidate_attr(inode);
+	fuse_write_update_size(inode, iocb->ki_pos);
+	return ret;
+}
+
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else {
+		inode_lock(inode);
+	}
+
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	ret = file_remove_privs(iocb->ki_filp);
+	if (ret)
+		goto out;
+	/* TODO file_update_time() but we don't want metadata I/O */
+
+	/* Do not use dax for file extending writes as write and on
+	 * disk i_size increase are not atomic otherwise.
+	 */
+	if (file_extending_write(iocb, from))
+		ret = fuse_dax_direct_write(iocb, from);
+	else
+		ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
+
+out:
+	inode_unlock(inode);
+
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
+}
+
 static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
 {
 	struct fuse_dax_mapping *range, *temp;
@@ -116,6 +653,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
 	if (!fcd)
 		return -ENOMEM;
 
+	spin_lock_init(&fcd->lock);
 	fcd->dev = dax_dev;
 	err = fuse_dax_mem_range_init(fcd);
 	if (err) {
@@ -127,6 +665,33 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
 	return 0;
 }
 
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+	fi->dax = NULL;
+	if (fc->dax) {
+		fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
+		if (!fi->dax)
+			return false;
+
+		init_rwsem(&fi->dax->sem);
+		fi->dax->tree = RB_ROOT_CACHED;
+	}
+
+	return true;
+}
+
+void fuse_dax_inode_init(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (!fc->dax)
+		return;
+
+	inode->i_flags |= S_DAX;
+}
+
 bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
 {
 	if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6611ef3269a8..6c586bc97b64 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1539,10 +1539,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
 
-	if (is_bad_inode(file_inode(file)))
+	if (is_bad_inode(inode))
 		return -EIO;
 
+	if (FUSE_IS_DAX(inode))
+		return fuse_dax_read_iter(iocb, to);
+
 	if (!(ff->open_flags & FOPEN_DIRECT_IO))
 		return fuse_cache_read_iter(iocb, to);
 	else
@@ -1553,10 +1557,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(file);
 
-	if (is_bad_inode(file_inode(file)))
+	if (is_bad_inode(inode))
 		return -EIO;
 
+	if (FUSE_IS_DAX(inode))
+		return fuse_dax_write_iter(iocb, from);
+
 	if (!(ff->open_flags & FOPEN_DIRECT_IO))
 		return fuse_cache_write_iter(iocb, from);
 	else
@@ -3440,4 +3448,7 @@ void fuse_init_file_inode(struct inode *inode)
 	fi->writectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
 	fi->writepages = RB_ROOT;
+
+	if (IS_ENABLED(CONFIG_FUSE_DAX))
+		fuse_dax_inode_init(inode);
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2f3f04aa64c7..2d2bdd596194 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -148,6 +148,13 @@ struct fuse_inode {
 
 	/** Lock to protect write related fields */
 	spinlock_t lock;
+
+#ifdef CONFIG_FUSE_DAX
+	/*
+	 * Dax specific inode data
+	 */
+	struct fuse_inode_dax *dax;
+#endif
 };
 
 /** FUSE inode state bits */
@@ -1104,8 +1111,16 @@ void fuse_free_conn(struct fuse_conn *fc);
 
 /* dax.c */
 
+#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+
+ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
+ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
+int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma);
 int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev);
 void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi);
+void fuse_dax_inode_init(struct inode *inode);
+void fuse_dax_inode_cleanup(struct inode *inode);
 bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 67e99cee5a4f..cab4239bd78a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -87,12 +87,19 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	mutex_init(&fi->mutex);
 	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();
-	if (!fi->forget) {
-		kmem_cache_free(fuse_inode_cachep, fi);
-		return NULL;
-	}
+	if (!fi->forget)
+		goto out_free;
+
+	if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
+		goto out_free_forget;
 
 	return &fi->inode;
+
+out_free_forget:
+	kfree(fi->forget);
+out_free:
+	kmem_cache_free(fuse_inode_cachep, fi);
+	return NULL;
 }
 
 static void fuse_free_inode(struct inode *inode)
@@ -101,6 +108,9 @@ static void fuse_free_inode(struct inode *inode)
 
 	mutex_destroy(&fi->mutex);
 	kfree(fi->forget);
+#ifdef CONFIG_FUSE_DAX
+	kfree(fi->dax);
+#endif
 	kmem_cache_free(fuse_inode_cachep, fi);
 }
 
@@ -112,6 +122,9 @@ static void fuse_evict_inode(struct inode *inode)
 	clear_inode(inode);
 	if (inode->i_sb->s_flags & SB_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
+
+		if (FUSE_IS_DAX(inode))
+			fuse_dax_inode_cleanup(inode);
 		fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
 		fi->forget = NULL;
 	}
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 60a7bfc787ce..8899e4862309 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -895,6 +895,7 @@ struct fuse_copy_file_range_in {
 };
 
 #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1)
 struct fuse_setupmapping_in {
 	/* An already open handle */
 	uint64_t	fh;
-- 
cgit v1.2.3


From c6ff213fe5b8696c9539a1b34ff03de9306dfff9 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 8 Sep 2020 18:01:48 +0200
Subject: fuse: add submount support to <uapi/linux/fuse.h>

- Add fuse_attr.flags

- Add FUSE_ATTR_SUBMOUNT

  This is a flag for fuse_attr.flags that indicates that the given entry
  resides on a different filesystem than the parent, and as such should
  have a different st_dev.

- Add FUSE_SUBMOUNTS

  The client sets this flag if it supports automounting directories.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/uapi/linux/fuse.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 8899e4862309..7233502ea991 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -172,6 +172,9 @@
  *  - add FUSE_WRITE_KILL_PRIV flag
  *  - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING
  *  - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag
+ *
+ *  7.32
+ *  - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS
  */
 
 #ifndef _LINUX_FUSE_H
@@ -207,7 +210,7 @@
 #define FUSE_KERNEL_VERSION 7
 
 /** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 31
+#define FUSE_KERNEL_MINOR_VERSION 32
 
 /** The node ID of the root inode */
 #define FUSE_ROOT_ID 1
@@ -231,7 +234,7 @@ struct fuse_attr {
 	uint32_t	gid;
 	uint32_t	rdev;
 	uint32_t	blksize;
-	uint32_t	padding;
+	uint32_t	flags;
 };
 
 struct fuse_kstatfs {
@@ -316,6 +319,7 @@ struct fuse_file_lock {
  * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for
  *		       foffset and moffset fields in struct
  *		       fuse_setupmapping_out and fuse_removemapping_one.
+ * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -344,6 +348,7 @@ struct fuse_file_lock {
 #define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
 #define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
 #define FUSE_MAP_ALIGNMENT	(1 << 26)
+#define FUSE_SUBMOUNTS		(1 << 27)
 
 /**
  * CUSE INIT request/reply flags
@@ -419,6 +424,13 @@ struct fuse_file_lock {
  */
 #define FUSE_FSYNC_FDATASYNC	(1 << 0)
 
+/**
+ * fuse_attr flags
+ *
+ * FUSE_ATTR_SUBMOUNT: Object is a submount root
+ */
+#define FUSE_ATTR_SUBMOUNT      (1 << 0)
+
 enum fuse_opcode {
 	FUSE_LOOKUP		= 1,
 	FUSE_FORGET		= 2,  /* no reply */
-- 
cgit v1.2.3