From 7eac52648a4c24ad23a05f62db97867c92a5747b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 Feb 2020 19:11:12 -0500 Subject: SUNRPC: Add a flag to avoid reference counts on credentials Add a flag to signal to the RPC layer that the credential is already pinned for the duration of the RPC call. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index a6ef35184ef1..df696efdd675 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -132,6 +132,7 @@ struct rpc_task_setup { #define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */ #define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */ #define RPC_TASK_NO_RETRANS_TIMEOUT 0x4000 /* wait forever for a reply */ +#define RPC_TASK_CRED_NOREF 0x8000 /* No refcount on the credential */ #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) -- cgit v1.2.3 From 8d6bda7f23a9b3ef1d7e386f01924c37f18fe771 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2020 11:21:12 -0400 Subject: SUNRPC: Remove xdr_buf_read_mic() Clean up: this function is no longer used. Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 - net/sunrpc/xdr.c | 55 ---------------------------------------------- 2 files changed, 56 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b41f34977995..8a6dd5bd6748 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -184,7 +184,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); -extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e5497dc2475b..15b58c5144f9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/** - * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf - * @buf: pointer to buffer containing a mic - * @mic: on success, returns the address of the mic - * @offset: the offset in buf where mic may be found - * - * This function may modify the xdr buf if the mic is found to be straddling - * a boundary between head, pages, and tail. On success the mic can be read - * from the address returned. There is no need to free the mic. - * - * Return: Success returns 0, otherwise an integer error. - */ -int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) -{ - struct xdr_buf subbuf; - unsigned int boundary; - - if (xdr_decode_word(buf, offset, &mic->len)) - return -EFAULT; - offset += 4; - - /* Is the mic partially in the head? */ - boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shift_buf(buf, boundary - offset); - - /* Is the mic partially in the pages? 
*/ - boundary += buf->page_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shrink_pagelen(buf, boundary - offset); - - if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) - return -EFAULT; - - /* Is the mic contained entirely in the head? */ - mic->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == mic->len) - return 0; - /* ..or is the mic contained entirely in the tail? */ - mic->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == mic->len) - return 0; - - /* Find a contiguous area in @buf to hold all of @mic */ - if (mic->len > buf->buflen - buf->len) - return -ENOMEM; - if (buf->tail[0].iov_len != 0) - mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; - else - mic->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); - return 0; -} -EXPORT_SYMBOL_GPL(xdr_buf_read_mic); - /* Returns 0 on success, or else a negative error code. */ static int xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, -- cgit v1.2.3 From d7242c4641fba521a1ea9dbccb11a40cf38cd912 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Mar 2020 17:22:47 -0400 Subject: pNFS: Add a helper to allocate the array of buckets Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 3 +++ fs/nfs/pnfs_nfs.c | 31 +++++++++++++++++++++++++++++++ include/linux/nfs_xdr.h | 15 ++++++++++++--- 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 7bfb6970134a..f6b1099aa151 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -366,6 +366,9 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); /* pnfs_nfs.c */ +struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags); +void pnfs_free_commit_array(struct pnfs_commit_array *p); + void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo); void pnfs_generic_commit_release(void *calldata); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 3d0942541618..c8518ce3a4ef 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -87,6 +87,37 @@ out: } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); +struct pnfs_commit_array * +pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags) +{ + struct pnfs_commit_array *p; + struct pnfs_commit_bucket *b; + + p = kmalloc(struct_size(p, buckets, n), gfp_flags); + if (!p) + return NULL; + p->nbuckets = n; + INIT_LIST_HEAD(&p->cinfo_list); + INIT_LIST_HEAD(&p->lseg_list); + p->lseg = NULL; + for (b = &p->buckets[0]; n != 0; b++, n--) { + INIT_LIST_HEAD(&b->written); + INIT_LIST_HEAD(&b->committing); + b->wlseg = NULL; + b->clseg = NULL; + b->direct_verf.committed = NFS_INVALID_STABLE_HOW; + } + return p; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array); + +void +pnfs_free_commit_array(struct pnfs_commit_array *p) +{ + kfree_rcu(p, rcu); +} +EXPORT_SYMBOL_GPL(pnfs_free_commit_array); + static int pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 94c77ed55ce1..e91c917c9c1c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1270,10 +1270,19 @@ struct pnfs_commit_bucket { struct nfs_writeverf direct_verf; }; +struct pnfs_commit_array { + struct list_head cinfo_list; + struct list_head lseg_list; + struct pnfs_layout_segment *lseg; + struct rcu_head rcu; + unsigned int nbuckets; + struct pnfs_commit_bucket 
buckets[]; +}; + struct pnfs_ds_commit_info { - int nwritten; - int ncommitting; - int nbuckets; + unsigned int nwritten; + unsigned int ncommitting; + unsigned int nbuckets; struct pnfs_commit_bucket *buckets; }; -- cgit v1.2.3 From 62a89501a3bde310ba339cc90bb05879528d84a0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 12 Feb 2020 11:12:35 -0500 Subject: xprtrdma: Enhance MR-related trace points Two changes: - Show the number of SG entries that were mapped. This helps debug DMA-related problems. - Record the MR's resource ID instead of its memory address. This groups each MR with its associated rdma-tool output, and reduces needless exposure of memory addresses. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 56 ++++++++++++++++++++++-------------------- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 2 files changed, 31 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c0e4c93324f5..87f4461ab108 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -228,20 +228,20 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done, TP_ARGS(wc, frwr), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(unsigned int, status) __field(unsigned int, vendor_err) ), TP_fast_assign( - __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr); + __entry->mr_id = frwr->fr_mr->res.id; __entry->status = wc->status; __entry->vendor_err = __entry->status ? wc->vendor_err : 0; ), TP_printk( - "mr=%p: %s (%u/0x%x)", - __entry->mr, rdma_show_wc_status(__entry->status), + "mr.id=%u: %s (%u/0x%x)", + __entry->mr_id, rdma_show_wc_status(__entry->status), __entry->status, __entry->vendor_err ) ); @@ -274,7 +274,8 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, TP_ARGS(mr), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) + __field(int, nents) __field(u32, handle) __field(u32, length) __field(u64, offset) @@ -282,15 +283,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; __entry->dir = mr->mr_dir; ), - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)", - __entry->mr, __entry->length, + TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)", + __entry->mr_id, __entry->nents, __entry->length, (unsigned long long)__entry->offset, __entry->handle, xprtrdma_show_direction(__entry->dir) ) @@ -920,17 +922,17 @@ TRACE_EVENT(xprtrdma_frwr_alloc, TP_ARGS(mr, rc), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(int, rc) ), TP_fast_assign( - __entry->mr = mr; - __entry->rc = rc; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->rc = rc; ), - TP_printk("mr=%p: rc=%d", - __entry->mr, __entry->rc + TP_printk("mr.id=%u: rc=%d", + __entry->mr_id, __entry->rc ) ); @@ -943,7 +945,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, TP_ARGS(mr, rc), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) + __field(int, nents) __field(u32, handle) __field(u32, length) __field(u64, offset) @@ -952,7 +955,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; @@ -960,8 +964,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, __entry->rc = rc; ), - TP_printk("mr=%p 
%u@0x%016llx:0x%08x (%s): rc=%d", - __entry->mr, __entry->length, + TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s): rc=%d", + __entry->mr_id, __entry->nents, __entry->length, (unsigned long long)__entry->offset, __entry->handle, xprtrdma_show_direction(__entry->dir), __entry->rc @@ -977,21 +981,21 @@ TRACE_EVENT(xprtrdma_frwr_sgerr, TP_ARGS(mr, sg_nents), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(u64, addr) __field(u32, dir) __field(int, nents) ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->nents = sg_nents; ), - TP_printk("mr=%p dma addr=0x%llx (%s) sg_nents=%d", - __entry->mr, __entry->addr, + TP_printk("mr.id=%u DMA addr=0x%llx (%s) sg_nents=%d", + __entry->mr_id, __entry->addr, xprtrdma_show_direction(__entry->dir), __entry->nents ) @@ -1006,7 +1010,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, TP_ARGS(mr, num_mapped), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(u64, addr) __field(u32, dir) __field(int, num_mapped) @@ -1014,15 +1018,15 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->num_mapped = num_mapped; __entry->nents = mr->mr_nents; ), - TP_printk("mr=%p dma addr=0x%llx (%s) nents=%d of %d", - __entry->mr, __entry->addr, + TP_printk("mr.id=%u DMA addr=0x%llx (%s) nents=%d of %d", + __entry->mr_id, __entry->addr, xprtrdma_show_direction(__entry->dir), __entry->num_mapped, __entry->nents ) @@ -1031,7 +1035,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); DEFINE_MR_EVENT(unmap); -DEFINE_MR_EVENT(remoteinv); +DEFINE_MR_EVENT(reminv); DEFINE_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 125297c9aa3e..0dc799553a08 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -419,7 +419,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_remoteinv(mr); + trace_xprtrdma_mr_reminv(mr); rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } -- cgit v1.2.3 From 81fe0c57f4e136375f3bcda74af413f82a34a1bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:38 -0500 Subject: xprtrdma: Invoke rpcrdma_ia_open in the connect worker Move rdma_cm_id creation into rpcrdma_ep_create() so that it is now responsible for allocating all per-connection hardware resources. With this clean-up, all three arms of the switch statement in rpcrdma_ep_connect are exactly the same now, thus the switch can be removed. Because device removal behaves a little differently than disconnection, there is a little more work to be done before rpcrdma_ep_destroy() can release the connection's rdma_cm_id. So it is not quite symmetrical with rpcrdma_ep_create() yet. 
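For reference, a minimal sketch of the unified connect path that results from this change (condensed from the verbs.c hunk below; the helper name example_xprt_connect() is illustrative only, and error handling, credit accounting, and the wait for the connection event are omitted):

    /* Every connection attempt -- initial connect, reconnect after a
     * disconnect, or recovery after device removal -- now follows the
     * same sequence.
     */
    static int example_xprt_connect(struct rpcrdma_xprt *r_xprt)
    {
            int rc;

            /* Tear down any previous per-connection resources. */
            rpcrdma_xprt_disconnect(r_xprt);

            /* rpcrdma_ep_create() now creates the rdma_cm_id as well as
             * the PD, CQs, and QP, so a separate rpcrdma_ia_open() call
             * is no longer needed.
             */
            rc = rpcrdma_ep_create(r_xprt);
            if (rc)
                    return rc;

            /* ... post Receives, rdma_connect(), wait for the CM event ... */
            return 0;
    }
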
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 1 - net/sunrpc/xprtrdma/transport.c | 7 -- net/sunrpc/xprtrdma/verbs.c | 153 ++++++---------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 - 4 files changed, 20 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 87f4461ab108..ba37c47b51e8 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -415,7 +415,6 @@ DEFINE_CONN_EVENT(disconnect); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); DEFINE_RXPRT_EVENT(xprtrdma_remove); -DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); DEFINE_RXPRT_EVENT(xprtrdma_op_setport); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6349e6c98b57..745dfd149637 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -286,7 +286,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); xprt_free(xprt); @@ -347,10 +346,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); - if (rc) - goto out1; - rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out2; @@ -372,8 +367,6 @@ out4: rpcrdma_buffer_destroy(&new_xprt->rx_buf); rc = -ENODEV; out2: - rpcrdma_ia_close(&new_xprt->rx_ia); -out1: trace_xprtrdma_op_destroy(new_xprt); xprt_rdma_free_addresses(xprt); xprt_free(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 36fe7baea014..3df20f355579 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -345,31 +345,6 @@ out: * Exported functions. */ -/** - * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: transport with IA to (re)initialize - * - * Returns 0 on success, negative errno if an appropriate - * Interface Adapter could not be found and opened. - */ -int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt) -{ - struct rpcrdma_ia *ia = &xprt->rx_ia; - int rc; - - ia->ri_id = rpcrdma_create_id(xprt, ia); - if (IS_ERR(ia->ri_id)) { - rc = PTR_ERR(ia->ri_id); - goto out_err; - } - return 0; - -out_err: - rpcrdma_ia_close(ia); - return rc; -} - /** * rpcrdma_ia_remove - Handle device driver unload * @ia: interface adapter being removed @@ -401,34 +376,26 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) trace_xprtrdma_remove(r_xprt); } -/** - * rpcrdma_ia_close - Clean up/close an IA. 
- * @ia: interface adapter to close - * - */ -void -rpcrdma_ia_close(struct rpcrdma_ia *ia) -{ - if (ia->ri_id && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; -} - -static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, - struct rdma_cm_id *id) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + struct rdma_cm_id *id; int rc; + id = rpcrdma_create_id(r_xprt, ia); + if (IS_ERR(id)) + return PTR_ERR(id); + ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; ep->rep_inline_send = xprt_rdma_max_inline_write; ep->rep_inline_recv = xprt_rdma_max_inline_read; rc = frwr_query_device(r_xprt, id->device); if (rc) - return rc; + goto out_destroy; + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); ep->rep_attr.event_handler = rpcrdma_qp_event_handler; @@ -507,10 +474,12 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); if (rc) goto out_destroy; + ia->ri_id = id; return 0; out_destroy: rpcrdma_ep_destroy(r_xprt); + rdma_destroy_id(id); return rc; } @@ -536,79 +505,8 @@ static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) ia->ri_pd = NULL; } -/* Re-establish a connection after a device removal event. - * Unlike a normal reconnection, a fresh PD and a new set - * of MRs and buffers is needed. - */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc, err; - - trace_xprtrdma_reinsert(r_xprt); - - rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt)) - goto out1; - - rc = -ENETUNREACH; - err = rpcrdma_ep_create(r_xprt, ia->ri_id); - if (err) - goto out2; - return 0; - -out2: - rpcrdma_ia_close(ia); -out1: - return rc; -} - -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id, *old; - int err, rc; - - rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia); - if (IS_ERR(id)) - goto out; - - /* As long as the new ID points to the same device as the - * old ID, we can reuse the transport's existing PD and all - * previously allocated MRs. Also, the same device means - * the transport's previous DMA mappings are still valid. - * - * This is a sanity check only. There should be no way these - * point to two different devices here. - */ - old = id; - rc = -ENETUNREACH; - if (ia->ri_id->device != id->device) { - pr_err("rpcrdma: can't reconnect on different device!\n"); - goto out_destroy; - } - - err = rpcrdma_ep_create(r_xprt, id); - if (err) - goto out_destroy; - - /* Atomically replace the transport's ID. */ - rc = 0; - old = ia->ri_id; - ia->ri_id = id; - -out_destroy: - rdma_destroy_id(old); -out: - return rc; -} - -/** - * rpcrdma_xprt_connect - Connect an unconnected transport - * @r_xprt: controlling transport instance - * - * Returns 0 on success or a negative errno. +/* + * Connect unconnected endpoint. 
*/ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { @@ -618,25 +516,10 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) int rc; retry: - switch (ep->rep_connected) { - case 0: - rc = -ENETUNREACH; - if (rpcrdma_ep_create(r_xprt, ia->ri_id)) - goto out_noupdate; - break; - case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt); - if (rc) - goto out_noupdate; - break; - case 1: - rpcrdma_xprt_disconnect(r_xprt); - /* fall through */ - default: - rc = rpcrdma_ep_reconnect(r_xprt); - if (rc) - goto out; - } + rpcrdma_xprt_disconnect(r_xprt); + rc = rpcrdma_ep_create(r_xprt); + if (rc) + goto out_noupdate; ep->rep_connected = 0; xprt_clear_connected(xprt); @@ -712,6 +595,10 @@ out: rpcrdma_sendctxs_destroy(r_xprt); rpcrdma_ep_destroy(r_xprt); + + if (ia->ri_id) + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9ead06b1d8a4..8be1b70b71a2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -457,9 +457,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); -void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c -- cgit v1.2.3 From d6ccebf956338ea015d7d54c4a4c9c17605707cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:49 -0500 Subject: xprtrdma: Disconnect on flushed completion Completion errors after a disconnect often occur much sooner than a CM_DISCONNECT event. Use this to try to detect connection loss more quickly. Note that other kernel ULPs do take care to disconnect explicitly when a WR is flushed. 
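To illustrate, the heart of the change is a small helper (condensed here from the verbs.c hunk in this patch) that the Send, Receive, FastReg, and LocalInv completion handlers all call when they see a flushed completion:

    void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
    {
            struct rpcrdma_xprt *r_xprt = cq->cq_context;
            struct rpc_xprt *xprt = &r_xprt->rx_xprt;

            /* A non-success completion on a live connection is treated as
             * evidence of connection loss: mark the endpoint aborted and
             * ask the RPC layer to disconnect immediately instead of
             * waiting for the CM_DISCONNECT event.
             */
            if (wc->status != IB_WC_SUCCESS &&
                r_xprt->rx_ep.rep_connected == 1) {
                    r_xprt->rx_ep.rep_connected = -ECONNABORTED;
                    trace_xprtrdma_flush_dct(r_xprt, wc->status);
                    xprt_force_disconnect(xprt);
            }
    }
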
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 3 ++- net/sunrpc/xprtrdma/frwr_ops.c | 24 ++++++++++++++++-------- net/sunrpc/xprtrdma/verbs.c | 37 ++++++++++++++++++++++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 47 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ba37c47b51e8..cfbe28ad2614 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -109,7 +109,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d", + TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d", __get_str(addr), __get_str(port), __entry->r_xprt, __entry->rc, __entry->connect_status ) @@ -411,6 +411,7 @@ TRACE_EVENT(xprtrdma_inline_thresh, DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); +DEFINE_CONN_EVENT(flush_dct); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a1b5c8024cca..b482fac7be89 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -358,8 +358,8 @@ out_mapmr_err: /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed FastReg WR * */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) @@ -371,6 +371,8 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, frwr); /* The MR will get recycled when the associated req is retransmitted */ + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -441,8 +443,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) /** * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) @@ -455,12 +457,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, frwr); __frwr_release_mr(wc, mr); + + rpcrdma_flush_disconnect(cq, wc); } /** * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * * Awaken anyone waiting for an MR to finish being fenced. 
*/ @@ -475,6 +479,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li_wake(wc, frwr); __frwr_release_mr(wc, mr); complete(&frwr->fr_linv_done); + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -562,8 +568,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /** * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -581,6 +587,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) /* Ensure @rep is generated before __frwr_release_mr */ smp_rmb(); rpcrdma_complete_rqst(rep); + + rpcrdma_flush_disconnect(cq, wc); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a7f46bbbf017..dfe680e3234a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -128,14 +128,32 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) trace_xprtrdma_qp_event(r_xprt, event); } +/** + * rpcrdma_flush_disconnect - Disconnect on flushed completion + * @cq: completion queue + * @wc: work completion entry + * + * Must be called in process context. + */ +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rpcrdma_xprt *r_xprt = cq->cq_context; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + if (wc->status != IB_WC_SUCCESS && r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -ECONNABORTED; + trace_xprtrdma_flush_dct(r_xprt, wc->status); + xprt_force_disconnect(xprt); + } +} + /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue - * @wc: completed WR + * @wc: WCE for a completed Send WR * */ -static void -rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = @@ -144,21 +162,21 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); + rpcrdma_flush_disconnect(cq, wc); } /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed Receive WR * */ -static void -rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); @@ -179,6 +197,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: + rpcrdma_flush_disconnect(cq, wc); rpcrdma_rep_destroy(rep); } @@ -395,7 +414,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) goto out_destroy; } - ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, NULL, + ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, ep->rep_attr.cap.max_recv_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->rep_attr.recv_cq)) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d2a0f125f7a8..8a3ac9d7ee81 100644 
--- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -452,6 +452,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -- cgit v1.2.3 From 93aa8e0a9de80e1df2be17158a3469285e572b39 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:54 -0500 Subject: xprtrdma: Merge struct rpcrdma_ia into struct rpcrdma_ep I eventually want to allocate rpcrdma_ep separately from struct rpcrdma_xprt so that on occasion there can be more than one ep per xprt. The new struct rpcrdma_ep will contain all the fields currently in rpcrdma_ia and in rpcrdma_ep. This is all the device and CM settings for the connection, in addition to per-connection settings negotiated with the remote. Take this opportunity to rename the existing ep fields from rep_* to re_* to disambiguate these from struct rpcrdma_rep. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 12 +- net/sunrpc/xprtrdma/backchannel.c | 4 +- net/sunrpc/xprtrdma/frwr_ops.c | 108 +++++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 31 ++--- net/sunrpc/xprtrdma/transport.c | 9 +- net/sunrpc/xprtrdma/verbs.c | 283 +++++++++++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 60 ++++---- 7 files changed, 246 insertions(+), 261 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index cfbe28ad2614..843269f0e291 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_fast_assign( __entry->r_xprt = r_xprt; __entry->rc = rc; - __entry->connect_status = r_xprt->rx_ep.rep_connected; + __entry->connect_status = r_xprt->rx_ep.re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -394,10 +394,10 @@ TRACE_EVENT(xprtrdma_inline_thresh, const struct rpcrdma_ep *ep = &r_xprt->rx_ep; __entry->r_xprt = r_xprt; - __entry->inline_send = ep->rep_inline_send; - __entry->inline_recv = ep->rep_inline_recv; - __entry->max_send = ep->rep_max_inline_send; - __entry->max_recv = ep->rep_max_inline_recv; + __entry->inline_send = ep->re_inline_send; + __entry->inline_recv = ep->re_inline_recv; + __entry->max_send = ep->re_max_inline_send; + __entry->max_recv = ep->re_max_inline_recv; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -803,7 +803,7 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->r_xprt = r_xprt; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep.rep_receive_count; + __entry->posted = r_xprt->rx_ep.re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 4b43910a6ed2..4b20102cf060 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -47,7 +47,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) struct rpcrdma_ep *ep = &r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); + maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -190,7 +190,7 @@ 
create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep.re_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index b482fac7be89..19bf422f010b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -115,13 +115,13 @@ void frwr_reset(struct rpcrdma_req *req) */ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; int rc; - frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; @@ -151,29 +151,24 @@ out_list_err: /** * frwr_query_device - Prepare a transport for use with FRWR - * @r_xprt: controlling transport instance + * @ep: endpoint to fill in * @device: RDMA device to query * * On success, sets: - * ep->rep_attr - * ep->rep_max_requests - * ia->ri_max_rdma_segs - * - * And these FRWR-related fields: - * ia->ri_max_frwr_depth - * ia->ri_mrtype + * ep->re_attr + * ep->re_max_requests + * ep->re_max_rdma_segs + * ep->re_max_fr_depth + * ep->re_mrtype * * Return values: * On success, returns zero. * %-EINVAL - the device does not support FRWR memory registration * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device) +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) { const struct ib_device_attr *attrs = &device->attrs; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; unsigned int max_sge; @@ -190,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); return -ENOMEM; } - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; + ep->re_attr.cap.max_send_sge = max_sge; + ep->re_attr.cap.max_recv_sge = 1; - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + ep->re_mrtype = IB_MR_TYPE_MEM_REG; if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len * capability, but perform optimally when the MRs are not larger * than a page. */ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_frwr_depth = attrs->max_sge_rd; + ep->re_max_fr_depth = attrs->max_sge_rd; else - ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; - if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) - ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; /* Add room for frwr register and invalidate WRs. * 1. 
FRWR reg WR for head @@ -222,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; do { depth += 2; /* FRWR reg + invalidate */ - delta -= ia->ri_max_frwr_depth; + delta -= ep->re_max_fr_depth; } while (delta > 0); } @@ -235,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (ep->rep_max_requests > max_qp_wr) - ep->rep_max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - ep->rep_max_requests = max_qp_wr / depth; - if (!ep->rep_max_requests) + if (ep->re_max_requests > max_qp_wr) + ep->re_max_requests = max_qp_wr; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { + ep->re_max_requests = max_qp_wr / depth; + if (!ep->re_max_requests) return -ENOMEM; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; } - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_rdma_segs = - DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ep->re_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_rdma_segs += 2; - if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + ep->re_max_rdma_segs += 2; + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; /* Ensure the underlying device is capable of conveying the * largest r/wsize NFS will ask for. This guarantees that * failing over from one RDMA device to another will not * break NFS I/O. 
*/ - if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) return -ENOMEM; return 0; @@ -288,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ia->ri_max_frwr_depth) - nsegs = ia->ri_max_frwr_depth; + if (nsegs > ep->re_max_fr_depth) + nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) sg_set_page(&mr->mr_sg[i], @@ -308,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -317,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; @@ -391,7 +386,6 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -411,7 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(ia->ri_id->qp, post_wr, NULL); + return ib_post_send(r_xprt->rx_ep.re_id->qp, post_wr, NULL); } /** @@ -538,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -643,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 28020ec104d4..ad7e6b0187bd 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /** * rpcrdma_set_max_header_sizes - Initialize inline payload sizes - * @r_xprt: transport instance to initialize + * @ep: endpoint to initialize * * The max_inline fields contain the maximum size of an RPC message * so the marshaling code doesn't have to repeat this calculation * for every RPC. 
*/ -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int maxsegs = ep->re_max_rdma_segs; - ep->rep_max_inline_send = - ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); - ep->rep_max_inline_recv = - ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -134,7 +133,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdr = &rqst->rq_snd_buf; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) + if (xdr->len > r_xprt->rx_ep.re_max_inline_send) return false; if (xdr->page_len) { @@ -145,7 +144,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) + if (++count > r_xprt->rx_ep.re_attr.cap.max_send_sge) return false; } } @@ -162,7 +161,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -176,7 +175,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.rep_max_inline_recv; + r_xprt->rx_ep.re_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -255,7 +254,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch && r_xprt->rx_ep.re_implicit_roundup) goto out; /* When encoding a Write chunk, some servers need to see an @@ -263,7 +262,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * layer provides space in the tail iovec that may be used * for this purpose. 
*/ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_writech && r_xprt->rx_ep.re_implicit_roundup) goto out; if (xdrbuf->tail[0].iov_len) @@ -1476,8 +1475,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.rep_max_requests) - credits = r_xprt->rx_ep.rep_max_requests; + else if (credits > r_xprt->rx_ep.re_max_requests) + credits = r_xprt->rx_ep.re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); rpcrdma_post_recvs(r_xprt, false); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d7b7dab0aeb6..4352fd6e9817 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -238,11 +238,12 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep.rep_connected > 0) { + if (ep->re_connect_status > 0) { xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; @@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ia.ri_id); + rdma_disconnect(r_xprt->rx_ep.re_id); } /** @@ -355,6 +356,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; dprintk("RPC: %s: %s:%s\n", __func__, @@ -489,10 +491,11 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned long delay; delay = 0; - if (r_xprt->rx_ep.rep_connected != 0) { + if (ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index dfe680e3234a..10826982ddf8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -97,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rdma_cm_id *id = r_xprt->rx_ep.re_id; /* Flush Receives, then wait for deferred Reply work * to complete. */ - ib_drain_rq(ia->ri_id->qp); + ib_drain_rq(id->qp); /* Deferred Reply processing might have scheduled * local invalidations. 
*/ - ib_drain_sq(ia->ri_id->qp); + ib_drain_sq(id->qp); } /** @@ -140,8 +140,9 @@ void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - if (wc->status != IB_WC_SUCCESS && r_xprt->rx_ep.rep_connected == 1) { - r_xprt->rx_ep.rep_connected = -ECONNABORTED; + if (wc->status != IB_WC_SUCCESS && + r_xprt->rx_ep.re_connect_status == 1) { + r_xprt->rx_ep.re_connect_status = -ECONNABORTED; trace_xprtrdma_flush_dct(r_xprt, wc->status); xprt_force_disconnect(xprt); } @@ -180,7 +181,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.rep_receive_count; + --r_xprt->rx_ep.re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -209,24 +210,24 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; + ep->re_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_implicit_roundup = true; + ep->re_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < ep->rep_inline_recv) - ep->rep_inline_recv = rsize; - if (wsize < ep->rep_inline_send) - ep->rep_inline_send = wsize; + if (rsize < ep->re_inline_recv) + ep->re_inline_recv = rsize; + if (wsize < ep->re_inline_send) + ep->re_inline_send = wsize; - rpcrdma_set_max_header_sizes(r_xprt); + rpcrdma_set_max_header_sizes(ep); } /** @@ -241,7 +242,6 @@ static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpc_xprt *xprt = &r_xprt->rx_xprt; @@ -251,57 +251,57 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: - ia->ri_async_rc = 0; - complete(&ia->ri_done); + ep->re_async_rc = 0; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EPROTO; - complete(&ia->ri_done); + ep->re_async_rc = -EPROTO; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ROUTE_ERROR: - ia->ri_async_rc = -ENETUNREACH; - complete(&ia->ri_done); + ep->re_async_rc = -ENETUNREACH; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_id->device->name, + ep->re_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif - init_completion(&ia->ri_remove_done); - ep->rep_connected = -ENODEV; + init_completion(&ep->re_remove_done); + ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); - wait_for_completion(&ia->ri_remove_done); + wait_for_completion(&ep->re_remove_done); trace_xprtrdma_remove(r_xprt); /* Return 1 to ensure the core destroys the id. 
*/ return 1; case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; - ep->rep_connected = 1; + ep->re_connect_status = 1; rpcrdma_update_cm_private(r_xprt, &event->param.conn); trace_xprtrdma_inline_thresh(r_xprt); - wake_up_all(&ep->rep_connect_wait); + wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: - ep->rep_connected = -ENOTCONN; + ep->re_connect_status = -ENOTCONN; goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - ep->rep_connected = -ENETUNREACH; + ep->re_connect_status = -ENETUNREACH; goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - ep->rep_connected = -ECONNREFUSED; + ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - ep->rep_connected = -EAGAIN; + ep->re_connect_status = -EAGAIN; goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ep->rep_connected = -ECONNABORTED; + ep->re_connect_status = -ECONNABORTED; disconnected: xprt_force_disconnect(xprt); - wake_up_all(&ep->rep_connect_wait); + wake_up_all(&ep->re_connect_wait); break; default: break; @@ -309,46 +309,46 @@ disconnected: dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_id->device->name, rdma_event_msg(event->event)); + ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } -static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rdma_cm_id *id; int rc; - init_completion(&ia->ri_done); + init_completion(&ep->re_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, - xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, r_xprt, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; - ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, - (struct sockaddr *)&xprt->rx_xprt.addr, + ep->re_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; - ia->ri_async_rc = -ETIMEDOUT; + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; @@ -366,102 +366,101 @@ out: static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + struct rpcrdma_connect_private *pmsg = &ep->re_cm_private; struct rdma_cm_id *id; int rc; - id = rpcrdma_create_id(r_xprt, ia); + id = rpcrdma_create_id(r_xprt, ep); if (IS_ERR(id)) return PTR_ERR(id); - ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; - ep->rep_inline_send = xprt_rdma_max_inline_write; - ep->rep_inline_recv = xprt_rdma_max_inline_read; - - rc = 
frwr_query_device(r_xprt, id->device); + ep->re_max_requests = r_xprt->rx_xprt.max_reqs; + ep->re_inline_send = xprt_rdma_max_inline_write; + ep->re_inline_recv = xprt_rdma_max_inline_read; + rc = frwr_query_device(ep, id->device); if (rc) goto out_destroy; - r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); - ep->rep_attr.event_handler = rpcrdma_qp_event_handler; - ep->rep_attr.qp_context = ep; - ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_inline_data = 0; - ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - ep->rep_attr.qp_type = IB_QPT_RC; - ep->rep_attr.port_num = ~0; + ep->re_attr.event_handler = rpcrdma_qp_event_handler; + ep->re_attr.qp_context = ep; + ep->re_attr.srq = NULL; + ep->re_attr.cap.max_inline_data = 0; + ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->re_attr.qp_type = IB_QPT_RC; + ep->re_attr.port_num = ~0; dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, - ep->rep_attr.cap.max_send_wr, - ep->rep_attr.cap.max_recv_wr, - ep->rep_attr.cap.max_send_sge, - ep->rep_attr.cap.max_recv_sge); - - ep->rep_send_batch = ep->rep_max_requests >> 3; - ep->rep_send_count = ep->rep_send_batch; - init_waitqueue_head(&ep->rep_connect_wait); - ep->rep_receive_count = 0; - - ep->rep_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, - ep->rep_attr.cap.max_send_wr, - IB_POLL_WORKQUEUE); - if (IS_ERR(ep->rep_attr.send_cq)) { - rc = PTR_ERR(ep->rep_attr.send_cq); + ep->re_attr.cap.max_send_wr, + ep->re_attr.cap.max_recv_wr, + ep->re_attr.cap.max_send_sge, + ep->re_attr.cap.max_recv_sge); + + ep->re_send_batch = ep->re_max_requests >> 3; + ep->re_send_count = ep->re_send_batch; + init_waitqueue_head(&ep->re_connect_wait); + + ep->re_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.send_cq)) { + rc = PTR_ERR(ep->re_attr.send_cq); goto out_destroy; } - ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, - ep->rep_attr.cap.max_recv_wr, - IB_POLL_WORKQUEUE); - if (IS_ERR(ep->rep_attr.recv_cq)) { - rc = PTR_ERR(ep->rep_attr.recv_cq); + ep->re_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.recv_cq)) { + rc = PTR_ERR(ep->re_attr.recv_cq); goto out_destroy; } + ep->re_receive_count = 0; /* Initialize cma parameters */ - memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); - ep->rep_remote_cma.private_data = pmsg; - ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); + ep->re_remote_cma.private_data = pmsg; + ep->re_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ - ep->rep_remote_cma.initiator_depth = 0; - ep->rep_remote_cma.responder_resources = + ep->re_remote_cma.initiator_depth = 0; + ep->re_remote_cma.responder_resources = min_t(int, U8_MAX, id->device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes 
quickly. RPC layer handles re-establishing * transport connection and retransmission. */ - ep->rep_remote_cma.retry_count = 6; + ep->re_remote_cma.retry_count = 6; /* RPC-over-RDMA handles its own flow control. In addition, * make all RNR NAKs visible so we know that RPC-over-RDMA * flow control is working correctly (no NAKs should be seen). */ - ep->rep_remote_cma.flow_control = 0; - ep->rep_remote_cma.rnr_retry_count = 0; + ep->re_remote_cma.flow_control = 0; + ep->re_remote_cma.rnr_retry_count = 0; - ia->ri_pd = ib_alloc_pd(id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); + ep->re_pd = ib_alloc_pd(id->device, 0); + if (IS_ERR(ep->re_pd)) { + rc = PTR_ERR(ep->re_pd); goto out_destroy; } - rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); if (rc) goto out_destroy; - ia->ri_id = id; + + ep->re_id = id; return 0; out_destroy: @@ -473,23 +472,22 @@ out_destroy: static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - if (ia->ri_id && ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; + if (ep->re_id && ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; } - if (ep->rep_attr.recv_cq) - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - if (ep->rep_attr.send_cq) - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; - if (ia->ri_pd) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; } /* @@ -499,7 +497,6 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; retry: @@ -508,7 +505,7 @@ retry: if (rc) goto out_noupdate; - ep->rep_connected = 0; + ep->re_connect_status = 0; xprt_clear_connected(xprt); rpcrdma_reset_cwnd(r_xprt); @@ -518,17 +515,18 @@ retry: if (rc) goto out; - rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) goto out; if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - if (ep->rep_connected <= 0) { - if (ep->rep_connected == -EAGAIN) + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 0); + if (ep->re_connect_status <= 0) { + if (ep->re_connect_status == -EAGAIN) goto retry; - rc = ep->rep_connected; + rc = ep->re_connect_status; goto out; } @@ -541,7 +539,7 @@ retry: out: if (rc) - ep->rep_connected = rc; + ep->re_connect_status = rc; out_noupdate: trace_xprtrdma_connect(r_xprt, rc); @@ -558,9 +556,8 @@ out_noupdate: void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id = ia->ri_id; - int rc, status = ep->rep_connected; + struct rdma_cm_id *id = ep->re_id; + int rc, status = ep->re_connect_status; might_sleep(); @@ -569,10 +566,10 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rc = rdma_disconnect(id); if (!rc) - wait_event_interruptible(ep->rep_connect_wait, - ep->rep_connected != 1); + wait_event_interruptible(ep->re_connect_wait, + 
ep->re_connect_status != 1); else - ep->rep_connected = rc; + ep->re_connect_status = rc; trace_xprtrdma_disconnect(r_xprt, rc); if (id->qp) @@ -585,10 +582,10 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rpcrdma_ep_destroy(r_xprt); if (status == -ENODEV) - complete(&ia->ri_remove_done); + complete(&ep->re_remove_done); else rdma_destroy_id(id); - ia->ri_id = NULL; + ep->re_id = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -625,7 +622,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), + sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), GFP_KERNEL); if (!sc) return NULL; @@ -645,7 +642,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; + i = r_xprt->rx_ep.re_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; @@ -756,10 +753,10 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int count; - for (count = 0; count < ia->ri_max_rdma_segs; count++) { + for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -808,7 +805,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) /* If there is no underlying connection, it's no use * to wake the refresh worker. */ - if (ep->rep_connected == 1) { + if (ep->re_connect_status == 1) { /* The work is scheduled on a WQ_MEM_RECLAIM * workqueue in order to prevent MR allocation * from recursing into NFS during direct reclaim. 
@@ -872,7 +869,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep.re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), DMA_TO_DEVICE, GFP_KERNEL); @@ -950,7 +947,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.re_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1175,7 +1172,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1293,7 +1290,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ep.re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1306,7 +1303,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep.re_pd->local_dma_lkey; return true; } @@ -1345,12 +1342,12 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; - if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { + if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { send_wr->send_flags |= IB_SEND_SIGNALED; - ep->rep_send_count = ep->rep_send_batch; + ep->re_send_count = ep->re_send_batch; } else { send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->rep_send_count; + --ep->re_send_count; } rc = frwr_send(r_xprt, req); @@ -1378,9 +1375,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (likely(ep->rep_receive_count > needed)) + if (likely(ep->re_receive_count > needed)) goto out; - needed -= ep->rep_receive_count; + needed -= ep->re_receive_count; if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; @@ -1406,7 +1403,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + rc = ib_post_recv(r_xprt->rx_ep.re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: trace_xprtrdma_post_recvs(r_xprt, count, rc); @@ -1420,6 +1417,6 @@ out: --count; } } - ep->rep_receive_count += count; + ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8a3ac9d7ee81..f3c0b826c9ed 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,38 +65,32 @@ #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) /* - * Interface Adapter -- one per transport instance + * RDMA Endpoint -- connection endpoint details */ -struct rpcrdma_ia { - struct rdma_cm_id *ri_id; - struct ib_pd *ri_pd; - int ri_async_rc; - unsigned int ri_max_rdma_segs; - unsigned int ri_max_frwr_depth; - bool ri_implicit_roundup; - enum ib_mr_type ri_mrtype; - struct completion ri_done; - struct 
completion ri_remove_done; -}; - -/* - * RDMA Endpoint -- one per transport instance - */ - struct rpcrdma_ep { - unsigned int rep_send_count; - unsigned int rep_send_batch; - unsigned int rep_max_inline_send; - unsigned int rep_max_inline_recv; - int rep_connected; - struct ib_qp_init_attr rep_attr; - wait_queue_head_t rep_connect_wait; - struct rpcrdma_connect_private rep_cm_private; - struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* depends on device */ - unsigned int rep_inline_send; /* negotiated */ - unsigned int rep_inline_recv; /* negotiated */ - int rep_receive_count; + struct rdma_cm_id *re_id; + struct ib_pd *re_pd; + unsigned int re_max_rdma_segs; + unsigned int re_max_fr_depth; + bool re_implicit_roundup; + enum ib_mr_type re_mrtype; + struct completion re_done; + struct completion re_remove_done; + unsigned int re_send_count; + unsigned int re_send_batch; + unsigned int re_max_inline_send; + unsigned int re_max_inline_recv; + int re_async_rc; + int re_connect_status; + struct ib_qp_init_attr re_attr; + wait_queue_head_t re_connect_wait; + struct rpcrdma_connect_private + re_cm_private; + struct rdma_conn_param re_remote_cma; + int re_receive_count; + unsigned int re_max_requests; /* depends on device */ + unsigned int re_inline_send; /* negotiated */ + unsigned int re_inline_recv; /* negotiated */ }; /* Pre-allocate extra Work Requests for handling backward receives @@ -417,7 +411,6 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ia rx_ia; struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; @@ -522,8 +515,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ void frwr_reset(struct rpcrdma_req *req); -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device); +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, @@ -555,7 +547,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype); void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); -- cgit v1.2.3 From 745b734c9bb80559b8ca64ae688901afefc1c3fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:01:00 -0500 Subject: xprtrdma: Extract sockaddr from struct rdma_cm_id rpcrdma_cm_event_handler() is always passed an @id pointer that is valid. However, in a subsequent patch, we won't be able to extract an r_xprt in every case. So instead of using the r_xprt's presentation address strings, extract them from struct rdma_cm_id. 
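As a minimal illustration of the idea (not part of this patch; the function name is made up, and <rdma/rdma_cm.h> is assumed), both endpoint addresses live in the CM ID's routing information and can be formatted directly with the %pISpc specifier, which is exactly what the reworked trace points rely on:

	/* Illustrative only: log a connection's endpoints straight from the cm_id */
	static void example_log_peer(const struct rdma_cm_id *id)
	{
		const struct sockaddr *src =
			(const struct sockaddr *)&id->route.addr.src_addr;
		const struct sockaddr *dst =
			(const struct sockaddr *)&id->route.addr.dst_addr;

		pr_info("rpcrdma: %pISpc -> %pISpc\n", src, dst);
	}
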
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 78 ++++++++++++++++++++++++++++-------------- net/sunrpc/xprtrdma/verbs.c | 35 ++++++++----------- 2 files changed, 67 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 843269f0e291..295f75b9b796 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -375,47 +375,74 @@ TRACE_EVENT(xprtrdma_cm_event, TRACE_EVENT(xprtrdma_inline_thresh, TP_PROTO( - const struct rpcrdma_xprt *r_xprt + const struct rpcrdma_ep *ep ), - TP_ARGS(r_xprt), + TP_ARGS(ep), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned int, inline_send) __field(unsigned int, inline_recv) __field(unsigned int, max_send) __field(unsigned int, max_recv) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - const struct rpcrdma_ep *ep = &r_xprt->rx_ep; + const struct rdma_cm_id *id = ep->re_id; - __entry->r_xprt = r_xprt; __entry->inline_send = ep->re_inline_send; __entry->inline_recv = ep->re_inline_recv; __entry->max_send = ep->re_max_inline_send; __entry->max_recv = ep->re_max_inline_recv; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); ), - TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("%pISpc -> %pISpc neg send/recv=%u/%u, calc send/recv=%u/%u", + __entry->srcaddr, __entry->dstaddr, __entry->inline_send, __entry->inline_recv, __entry->max_send, __entry->max_recv ) ); +TRACE_EVENT(xprtrdma_remove, + TP_PROTO( + const struct rpcrdma_ep *ep + ), + + TP_ARGS(ep), + + TP_STRUCT__entry( + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(name, ep->re_id->device->name) + ), + + TP_fast_assign( + const struct rdma_cm_id *id = ep->re_id; + + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(name, id->device->name); + ), + + TP_printk("%pISpc -> %pISpc device=%s", + __entry->srcaddr, __entry->dstaddr, __get_str(name) + ) +); + DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); DEFINE_CONN_EVENT(flush_dct); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); -DEFINE_RXPRT_EVENT(xprtrdma_remove); DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); DEFINE_RXPRT_EVENT(xprtrdma_op_setport); @@ -482,32 +509,33 @@ TRACE_EVENT(xprtrdma_op_set_cto, TRACE_EVENT(xprtrdma_qp_event, TP_PROTO( - const struct rpcrdma_xprt *r_xprt, + const struct rpcrdma_ep *ep, const struct ib_event *event ), - TP_ARGS(r_xprt, event), + TP_ARGS(ep, event), TP_STRUCT__entry( - __field(const void *, r_xprt) - __field(unsigned int, event) + __field(unsigned long, event) __string(name, event->device->name) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, 
sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; + const struct rdma_cm_id *id = ep->re_id; + __entry->event = event->event; __assign_str(name, event->device->name); - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)", - __get_str(addr), __get_str(port), __entry->r_xprt, - __get_str(name), rdma_show_ib_event(__entry->event), - __entry->event + TP_printk("%pISpc -> %pISpc device=%s %s (%lu)", + __entry->srcaddr, __entry->dstaddr, __get_str(name), + rdma_show_ib_event(__entry->event), __entry->event ) ); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 10826982ddf8..37d07072bdbf 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -116,16 +116,14 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * @context: ep that owns QP where event occurred * * Called from the RDMA provider (device driver) possibly in an interrupt - * context. + * context. The QP is always destroyed before the ID, so the ID will be + * reliably available when this handler is invoked. */ -static void -rpcrdma_qp_event_handler(struct ib_event *event, void *context) +static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); - trace_xprtrdma_qp_event(r_xprt, event); + trace_xprtrdma_qp_event(ep, event); } /** @@ -202,11 +200,10 @@ out_flushed: rpcrdma_rep_destroy(rep); } -static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ @@ -241,6 +238,7 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_xprt *r_xprt = id->context; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpc_xprt *xprt = &r_xprt->rx_xprt; @@ -263,24 +261,21 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %s:%s\n", - ep->re_id->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); -#endif + pr_info("rpcrdma: removing device %s for %pISpc\n", + ep->re_id->device->name, sap); init_completion(&ep->re_remove_done); ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); wait_for_completion(&ep->re_remove_done); - trace_xprtrdma_remove(r_xprt); + trace_xprtrdma_remove(ep); /* Return 1 to ensure the core destroys the id. 
*/ return 1; case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; ep->re_connect_status = 1; - rpcrdma_update_cm_private(r_xprt, &event->param.conn); - trace_xprtrdma_inline_thresh(r_xprt); + rpcrdma_update_cm_private(ep, &event->param.conn); + trace_xprtrdma_inline_thresh(ep); wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -290,9 +285,8 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ENETUNREACH; goto disconnected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - rdma_reject_msg(id, event->status)); + dprintk("rpcrdma: connection to %pISpc rejected: %s\n", + sap, rdma_reject_msg(id, event->status)); ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->re_connect_status = -EAGAIN; @@ -307,8 +301,7 @@ disconnected: break; } - dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap, ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } -- cgit v1.2.3 From e28ce90083f032ca0e8ea03478f5b6a38f5930f7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:01:05 -0500 Subject: xprtrdma: kmalloc rpcrdma_ep separate from rpcrdma_xprt Change the rpcrdma_xprt_disconnect() function so that it no longer waits for the DISCONNECTED event. This prevents blocking if the remote is unresponsive. In rpcrdma_xprt_disconnect(), the transport's rpcrdma_ep is detached. Upon return from rpcrdma_xprt_disconnect(), the transport (r_xprt) is ready immediately for a new connection. The RDMA_CM_DEVICE_REMOVAL and RDMA_CM_DISCONNECTED events are now handled almost identically. However, because the lifetimes of rpcrdma_xprt structures and rpcrdma_ep structures are now independent, creating an rpcrdma_ep needs to take a module ref count. The ep now owns most of the hardware resources for a transport. Also, a kref is needed to ensure that rpcrdma_ep sticks around long enough for the cm_event_handler to finish. 
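The lifetime rule is the usual kref pattern: the transport holds one reference on the ep, an established connection holds another, and the final put is what releases the hardware resources and the module reference. A condensed sketch of that pattern (a restatement for clarity, not a literal copy of the hunks below):

	/* Sketch only: the last kref_put() tears down the ep */
	static void example_ep_release(struct kref *kref)
	{
		struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);

		/* destroy the QP, CQs and PD here, then: */
		kfree(ep);
		module_put(THIS_MODULE);
	}

	/* Returns 1 if the ep was freed, 0 if references remain */
	static int example_ep_put(struct rpcrdma_ep *ep)
	{
		return kref_put(&ep->re_kref, example_ep_release);
	}
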
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 63 +------------ net/sunrpc/xprtrdma/backchannel.c | 4 +- net/sunrpc/xprtrdma/frwr_ops.c | 12 +-- net/sunrpc/xprtrdma/rpc_rdma.c | 17 ++-- net/sunrpc/xprtrdma/transport.c | 37 ++++---- net/sunrpc/xprtrdma/verbs.c | 194 ++++++++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 7 +- 7 files changed, 143 insertions(+), 191 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 295f75b9b796..81b87428f166 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_fast_assign( __entry->r_xprt = r_xprt; __entry->rc = rc; - __entry->connect_status = r_xprt->rx_ep.re_connect_status; + __entry->connect_status = r_xprt->rx_ep->re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -342,37 +342,6 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event, ** Connection events **/ -TRACE_EVENT(xprtrdma_cm_event, - TP_PROTO( - const struct rpcrdma_xprt *r_xprt, - struct rdma_cm_event *event - ), - - TP_ARGS(r_xprt, event), - - TP_STRUCT__entry( - __field(const void *, r_xprt) - __field(unsigned int, event) - __field(int, status) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) - ), - - TP_fast_assign( - __entry->r_xprt = r_xprt; - __entry->event = event->event; - __entry->status = event->status; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); - ), - - TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)", - __get_str(addr), __get_str(port), - __entry->r_xprt, rdma_show_cm_event(__entry->event), - __entry->event, __entry->status - ) -); - TRACE_EVENT(xprtrdma_inline_thresh, TP_PROTO( const struct rpcrdma_ep *ep @@ -409,34 +378,6 @@ TRACE_EVENT(xprtrdma_inline_thresh, ) ); -TRACE_EVENT(xprtrdma_remove, - TP_PROTO( - const struct rpcrdma_ep *ep - ), - - TP_ARGS(ep), - - TP_STRUCT__entry( - __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) - __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) - __string(name, ep->re_id->device->name) - ), - - TP_fast_assign( - const struct rdma_cm_id *id = ep->re_id; - - memcpy(__entry->srcaddr, &id->route.addr.src_addr, - sizeof(struct sockaddr_in6)); - memcpy(__entry->dstaddr, &id->route.addr.dst_addr, - sizeof(struct sockaddr_in6)); - __assign_str(name, id->device->name); - ), - - TP_printk("%pISpc -> %pISpc device=%s", - __entry->srcaddr, __entry->dstaddr, __get_str(name) - ) -); - DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); DEFINE_CONN_EVENT(flush_dct); @@ -831,7 +772,7 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->r_xprt = r_xprt; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep.re_receive_count; + __entry->posted = r_xprt->rx_ep->re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 4b20102cf060..c92c1aac270a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,7 +44,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; size_t maxmsg; maxmsg = 
min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); @@ -190,7 +190,7 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.re_inline_recv, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 19bf422f010b..ef997880e17a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -115,7 +115,7 @@ void frwr_reset(struct rpcrdma_req *req) */ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; @@ -283,7 +283,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; @@ -405,7 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep.re_id->qp, post_wr, NULL); + return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); } /** @@ -535,7 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -640,7 +640,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. 
*/ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ad7e6b0187bd..d1af48e0139c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -131,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.re_max_inline_send) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -144,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.re_attr.cap.max_send_sge) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -161,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.re_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -175,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.re_max_inline_recv; + r_xprt->rx_ep->re_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -254,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && r_xprt->rx_ep.re_implicit_roundup) + if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) goto out; /* When encoding a Write chunk, some servers need to see an @@ -262,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * layer provides space in the tail iovec that may be used * for this purpose. 
*/ - if (type == rpcrdma_writech && r_xprt->rx_ep.re_implicit_roundup) + if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) goto out; if (xdrbuf->tail[0].iov_len) @@ -1475,8 +1476,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.re_max_requests) - credits = r_xprt->rx_ep.re_max_requests; + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); rpcrdma_post_recvs(r_xprt, false); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4352fd6e9817..659da37020a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -238,12 +238,12 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (ep->re_connect_status > 0) { + if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) { + xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; @@ -266,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ep.re_id); + rdma_disconnect(r_xprt->rx_ep->re_id); } /** @@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, xprt_rdma_slot_table_entries); - if (!xprt) + if (!xprt) { + module_put(THIS_MODULE); return ERR_PTR(-ENOMEM); + } xprt->timeout = &xprt_rdma_default_timeout; xprt->connect_timeout = xprt->timeout->to_initval; @@ -348,11 +353,12 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); rc = rpcrdma_buffer_create(new_xprt); - if (rc) - goto out2; - - if (!try_module_get(THIS_MODULE)) - goto out4; + if (rc) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); @@ -364,15 +370,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_PORT]); trace_xprtrdma_create(new_xprt); return xprt; - -out4: - rpcrdma_buffer_destroy(&new_xprt->rx_buf); - rc = -ENODEV; -out2: - trace_xprtrdma_op_destroy(new_xprt); - xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - return ERR_PTR(rc); } /** @@ -491,11 +488,11 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; delay = 0; - if (ep->re_connect_status != 0) { + if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 37d07072bdbf..cdd84c09df10 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -84,7 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); 
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -97,7 +97,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rdma_cm_id *id = r_xprt->rx_ep.re_id; + struct rdma_cm_id *id = r_xprt->rx_ep->re_id; /* Flush Receives, then wait for deferred Reply work * to complete. @@ -139,8 +139,8 @@ void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) struct rpc_xprt *xprt = &r_xprt->rx_xprt; if (wc->status != IB_WC_SUCCESS && - r_xprt->rx_ep.re_connect_status == 1) { - r_xprt->rx_ep.re_connect_status = -ECONNABORTED; + r_xprt->rx_ep->re_connect_status == 1) { + r_xprt->rx_ep->re_connect_status = -ECONNABORTED; trace_xprtrdma_flush_dct(r_xprt, wc->status); xprt_force_disconnect(xprt); } @@ -179,7 +179,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.re_receive_count; + --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -239,13 +239,11 @@ static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; - struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_ep *ep = id->context; + struct rpc_xprt *xprt = ep->re_xprt; might_sleep(); - trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -263,16 +261,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: pr_info("rpcrdma: removing device %s for %pISpc\n", ep->re_id->device->name, sap); - init_completion(&ep->re_remove_done); + /* fall through */ + case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); - wait_for_completion(&ep->re_remove_done); - trace_xprtrdma_remove(ep); - - /* Return 1 to ensure the core destroys the id. */ - return 1; + goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->connect_cookie; + kref_get(&ep->re_kref); ep->re_connect_status = 1; rpcrdma_update_cm_private(ep, &event->param.conn); trace_xprtrdma_inline_thresh(ep); @@ -294,9 +289,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); - wake_up_all(&ep->re_connect_wait); - break; + return rpcrdma_ep_destroy(ep); default: break; } @@ -316,7 +309,7 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, init_completion(&ep->re_done); - id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, r_xprt, + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; @@ -352,25 +345,66 @@ out: return ERR_PTR(rc); } -/* - * Exported functions. 
+static void rpcrdma_ep_put(struct kref *kref) +{ + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); + + if (ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; + } + + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; + + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; + + kfree(ep); + module_put(THIS_MODULE); +} + +/* Returns: + * %0 if @ep still has a positive kref count, or + * %1 if @ep was destroyed successfully. */ +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep) +{ + return kref_put(&ep->re_kref, rpcrdma_ep_put); +} static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_connect_private *pmsg = &ep->re_cm_private; + struct rpcrdma_connect_private *pmsg; + struct ib_device *device; struct rdma_cm_id *id; + struct rpcrdma_ep *ep; int rc; + ep = kzalloc(sizeof(*ep), GFP_NOFS); + if (!ep) + return -EAGAIN; + ep->re_xprt = &r_xprt->rx_xprt; + kref_init(&ep->re_kref); + id = rpcrdma_create_id(r_xprt, ep); - if (IS_ERR(id)) - return PTR_ERR(id); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + goto out_free; + } + __module_get(THIS_MODULE); + device = id->device; + ep->re_id = id; ep->re_max_requests = r_xprt->rx_xprt.max_reqs; ep->re_inline_send = xprt_rdma_max_inline_write; ep->re_inline_recv = xprt_rdma_max_inline_read; - rc = frwr_query_device(ep, id->device); + rc = frwr_query_device(ep, device); if (rc) goto out_destroy; @@ -396,7 +430,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_send_count = ep->re_send_batch; init_waitqueue_head(&ep->re_connect_wait); - ep->re_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, ep->re_attr.cap.max_send_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.send_cq)) { @@ -404,7 +438,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) goto out_destroy; } - ep->re_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, ep->re_attr.cap.max_recv_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.recv_cq)) { @@ -417,6 +451,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ + pmsg = &ep->re_cm_private; pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; @@ -428,7 +463,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) /* Client offers RDMA Read but does not initiate */ ep->re_remote_cma.initiator_depth = 0; ep->re_remote_cma.responder_resources = - min_t(int, U8_MAX, id->device->attrs.max_qp_rd_atom); + min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. 
RPC layer handles re-establishing @@ -443,7 +478,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_remote_cma.flow_control = 0; ep->re_remote_cma.rnr_retry_count = 0; - ep->re_pd = ib_alloc_pd(id->device, 0); + ep->re_pd = ib_alloc_pd(device, 0); if (IS_ERR(ep->re_pd)) { rc = PTR_ERR(ep->re_pd); goto out_destroy; @@ -453,50 +488,36 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) if (rc) goto out_destroy; - ep->re_id = id; + r_xprt->rx_ep = ep; return 0; out_destroy: - rpcrdma_ep_destroy(r_xprt); + rpcrdma_ep_destroy(ep); rdma_destroy_id(id); +out_free: + kfree(ep); + r_xprt->rx_ep = NULL; return rc; } -static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - if (ep->re_id && ep->re_id->qp) { - rdma_destroy_qp(ep->re_id); - ep->re_id->qp = NULL; - } - - if (ep->re_attr.recv_cq) - ib_free_cq(ep->re_attr.recv_cq); - ep->re_attr.recv_cq = NULL; - if (ep->re_attr.send_cq) - ib_free_cq(ep->re_attr.send_cq); - ep->re_attr.send_cq = NULL; - - if (ep->re_pd) - ib_dealloc_pd(ep->re_pd); - ep->re_pd = NULL; -} - -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. */ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep; int rc; retry: rpcrdma_xprt_disconnect(r_xprt); rc = rpcrdma_ep_create(r_xprt); if (rc) - goto out_noupdate; + return rc; + ep = r_xprt->rx_ep; ep->re_connect_status = 0; xprt_clear_connected(xprt); @@ -533,8 +554,6 @@ retry: out: if (rc) ep->re_connect_status = rc; - -out_noupdate: trace_xprtrdma_connect(r_xprt, rc); return rc; } @@ -545,40 +564,33 @@ out_noupdate: * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. + * + * On return, @r_xprt is completely divested of all hardware + * resources and prepared for the next ->connect operation. */ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rdma_cm_id *id = ep->re_id; - int rc, status = ep->re_connect_status; - - might_sleep(); + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id; + int rc; - if (!id) + if (!ep) return; + id = ep->re_id; rc = rdma_disconnect(id); - if (!rc) - wait_event_interruptible(ep->re_connect_wait, - ep->re_connect_status != 1); - else - ep->re_connect_status = rc; trace_xprtrdma_disconnect(r_xprt, rc); - if (id->qp) - rpcrdma_xprt_drain(r_xprt); + rpcrdma_xprt_drain(r_xprt); rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); - rpcrdma_ep_destroy(r_xprt); - - if (status == -ENODEV) - complete(&ep->re_remove_done); - else + if (rpcrdma_ep_destroy(ep)) rdma_destroy_id(id); - ep->re_id = NULL; + + r_xprt->rx_ep = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -635,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. 
*/ - i = r_xprt->rx_ep.re_max_requests + RPCRDMA_MAX_BC_REQUESTS; + i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); + sc = rpcrdma_sendctx_create(r_xprt->rx_ep); if (!sc) return -ENOMEM; @@ -746,7 +758,7 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count; for (count = 0; count < ep->re_max_rdma_segs; count++) { @@ -793,7 +805,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; /* If there is no underlying connection, it's no use * to wake the refresh worker. @@ -862,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ep.re_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), DMA_TO_DEVICE, GFP_KERNEL); @@ -940,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.re_inline_recv, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1165,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1283,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ep.re_id->device; + struct ib_device *device = r_xprt->rx_ep->re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1296,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ep.re_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; return true; } @@ -1332,7 +1344,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_wr; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; int rc; if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { @@ -1359,7 +1371,7 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; int needed, count, rc; @@ -1396,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ep.re_id->qp, wr, + rc = 
ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: trace_xprtrdma_post_recvs(r_xprt, count, rc); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f3c0b826c9ed..0a16fdb09b2c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,6 +68,7 @@ * RDMA Endpoint -- connection endpoint details */ struct rpcrdma_ep { + struct kref re_kref; struct rdma_cm_id *re_id; struct ib_pd *re_pd; unsigned int re_max_rdma_segs; @@ -75,7 +76,6 @@ struct rpcrdma_ep { bool re_implicit_roundup; enum ib_mr_type re_mrtype; struct completion re_done; - struct completion re_remove_done; unsigned int re_send_count; unsigned int re_send_batch; unsigned int re_max_inline_send; @@ -83,7 +83,8 @@ struct rpcrdma_ep { int re_async_rc; int re_connect_status; struct ib_qp_init_attr re_attr; - wait_queue_head_t re_connect_wait; + wait_queue_head_t re_connect_wait; + struct rpc_xprt *re_xprt; struct rpcrdma_connect_private re_cm_private; struct rdma_conn_param re_remote_cma; @@ -411,7 +412,7 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ep rx_ep; + struct rpcrdma_ep *rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; -- cgit v1.2.3 From c21e7168848d4ff4158120dbd4464f0d5cfb1456 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Mar 2020 13:36:36 -0400 Subject: NFSv4/pnfs: Support a list of commit arrays in struct pnfs_ds_commit_info When we have multiple layout segments with different lists of mirrored data, we need to track the commits on a per layout segment basis. This patch adds a list to support this tracking in struct pnfs_ds_commit_info. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + fs/nfs/filelayout/filelayout.c | 5 ++++- fs/nfs/flexfilelayout/flexfilelayout.c | 1 + fs/nfs/pnfs.h | 11 +++++++++++ include/linux/nfs_xdr.h | 1 + 5 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ade2435551c8..f9a73febce02 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -305,6 +305,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); + pnfs_init_ds_commit_info(&dreq->ds_cinfo); dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bd234394a87c..b051d5d320ba 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1140,7 +1140,10 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return flo != NULL ? 
&flo->generic_hdr : NULL; + if (flo == NULL) + return NULL; + pnfs_init_ds_commit_info(&flo->commit_info); + return &flo->generic_hdr; } static void diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 19728206e9c6..c9e79c8e62cd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -48,6 +48,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { + pnfs_init_ds_commit_info(&ffl->commit_info); INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); ffl->last_report_time = ktime_get(); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f6b1099aa151..b293afb48d04 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -462,6 +462,12 @@ pnfs_get_ds_info(struct inode *inode) return ld->get_ds_info(inode); } +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ + INIT_LIST_HEAD(&fl_cinfo->commits); +} + static inline void pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) { @@ -759,6 +765,11 @@ pnfs_get_ds_info(struct inode *inode) return NULL; } +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ +} + static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e91c917c9c1c..9946787eda72 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1280,6 +1280,7 @@ struct pnfs_commit_array { }; struct pnfs_ds_commit_info { + struct list_head commits; unsigned int nwritten; unsigned int ncommitting; unsigned int nbuckets; -- cgit v1.2.3 From a9901899b649dc80ef75c14d6d78059cae14def7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Mar 2020 16:04:06 -0400 Subject: pNFS: Add infrastructure for cleaning up per-layout commit structures Ensure that both the file and flexfiles layout types clean up when freeing the layout segments. 
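Both layout drivers end up with the same shape of cleanup hook. A condensed sketch of the new ->release_ds_info callback, mirroring the filelayout and flexfiles implementations added below (the commit-info list must be torn down under the inode lock):

	/* Sketch: destroy all per-lseg commit arrays while holding i_lock */
	static void example_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
					    struct inode *inode)
	{
		spin_lock(&inode->i_lock);
		pnfs_generic_ds_cinfo_destroy(fl_cinfo);
		spin_unlock(&inode->i_lock);
	}
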
Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 16 +++++++ fs/nfs/flexfilelayout/flexfilelayout.c | 11 +++++ fs/nfs/internal.h | 4 +- fs/nfs/pnfs.c | 1 + fs/nfs/pnfs.h | 4 ++ fs/nfs/pnfs_nfs.c | 88 ++++++++++++++++++++++++++++++++-- include/linux/nfs_xdr.h | 1 + 7 files changed, 121 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index b051d5d320ba..ffc5e2af1776 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -750,11 +750,16 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; + struct inode *inode; flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); + inode = flo->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); flo->commit_info.nbuckets = 0; kfree(flo->commit_info.buckets); flo->commit_info.buckets = NULL; + pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg); + spin_unlock(&inode->i_lock); } _filelayout_free_lseg(fl); } @@ -1163,6 +1168,16 @@ filelayout_get_ds_info(struct inode *inode) return &FILELAYOUT_FROM_HDR(layout)->commit_info; } +static void +filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -1176,6 +1191,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, + .release_ds_info = filelayout_release_ds_info, .mark_request_commit = filelayout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index c9e79c8e62cd..8e1393d75cbc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -580,6 +580,7 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) kfree(ffl->commit_info.buckets); ffl->commit_info.buckets = NULL; } + pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg); spin_unlock(&inode->i_lock); } _ff_layout_free_lseg(fls); @@ -2003,6 +2004,15 @@ ff_layout_get_ds_info(struct inode *inode) return &FF_LAYOUT_FROM_HDR(layout)->commit_info; } +static void +ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + static void ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { @@ -2503,6 +2513,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, + .release_ds_info = ff_layout_release_ds_info, .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4a1adad3740f..683146a51599 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -534,9 +534,11 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) 
pnfs_bucket_clear_pnfs_ds_commit_verifiers(cinfo->buckets, cinfo->nbuckets); - list_for_each_entry(array, &cinfo->commits, cinfo_list) + rcu_read_lock(); + list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, array->nbuckets); + rcu_read_unlock(); } #else static inline diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6b25117fca5f..eba18f137fb0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -506,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); + INIT_LIST_HEAD(&lseg->pls_commits); refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2ec97b419b56..6c48bd7b4640 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -66,6 +66,7 @@ struct nfs4_pnfs_ds { struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; + struct list_head pls_commits; struct pnfs_layout_range pls_range; refcount_t pls_refcount; u32 pls_seq; @@ -370,6 +371,9 @@ void nfs4_deviceid_purge_client(const struct nfs_client *); /* pnfs_nfs.c */ struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags); void pnfs_free_commit_array(struct pnfs_commit_array *p); +void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg); +void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo); void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f895a28b1e26..edad251a6a48 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -118,6 +118,67 @@ pnfs_free_commit_array(struct pnfs_commit_array *p) } EXPORT_SYMBOL_GPL(pnfs_free_commit_array); +static void +pnfs_release_commit_array_locked(struct pnfs_commit_array *array) +{ + list_del_rcu(&array->cinfo_list); + list_del(&array->lseg_list); + pnfs_free_commit_array(array); +} + +static void +pnfs_put_commit_array_locked(struct pnfs_commit_array *array) +{ + if (refcount_dec_and_test(&array->refcount)) + pnfs_release_commit_array_locked(array); +} + +static void +pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode) +{ + if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) { + pnfs_release_commit_array_locked(array); + spin_unlock(&inode->i_lock); + } +} + +static struct pnfs_commit_array * +pnfs_get_commit_array(struct pnfs_commit_array *array) +{ + if (refcount_inc_not_zero(&array->refcount)) + return array; + return NULL; +} + +static void +pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array) +{ + array->lseg = NULL; + list_del_init(&array->lseg_list); + pnfs_put_commit_array_locked(array); +} + +void +pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg); + +void +pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy); + /* * Locks the nfs_page requests for commit and moves them to 
* @bucket->committing. @@ -177,14 +238,21 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max) max -= cnt; if (!max) return rv; - list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); cnt = pnfs_bucket_scan_array(cinfo, array->buckets, array->nbuckets, max); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); rv += cnt; max -= cnt; if (!max) break; } + rcu_read_unlock(); return rv; } EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); @@ -230,13 +298,20 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, fl_cinfo->nbuckets, cinfo); fl_cinfo->nwritten -= nwritten; - list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); nwritten = pnfs_bucket_recover_commit_reqs(dst, array->buckets, array->nbuckets, cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); fl_cinfo->nwritten -= nwritten; } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); @@ -330,9 +405,16 @@ pnfs_alloc_ds_commits_list(struct list_head *list, struct pnfs_commit_array *array; unsigned int ret = 0; - list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); ret += pnfs_bucket_alloc_ds_commits(list, array->buckets, array->nbuckets, cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); + } return ret; } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9946787eda72..33be2ee2a248 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1275,6 +1275,7 @@ struct pnfs_commit_array { struct list_head lseg_list; struct pnfs_layout_segment *lseg; struct rcu_head rcu; + refcount_t refcount; unsigned int nbuckets; struct pnfs_commit_bucket buckets[]; }; -- cgit v1.2.3 From 0aa647b7369dd29de0789c321111b2e4668c46b2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Mar 2020 09:50:05 -0400 Subject: NFS: Remove bucket array from struct pnfs_ds_commit_info Remove the unused bucket array in struct pnfs_ds_commit_info. 
Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 - fs/nfs/filelayout/filelayout.c | 75 +-------------------------------- fs/nfs/flexfilelayout/flexfilelayout.c | 76 ---------------------------------- fs/nfs/internal.h | 3 -- fs/nfs/pnfs_nfs.c | 18 -------- include/linux/nfs_xdr.h | 13 ------ 6 files changed, 1 insertion(+), 185 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4ee26465b510..61f93a0fb0e0 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -217,7 +217,6 @@ static void nfs_direct_req_free(struct kref *kref) struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode); - nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 795508054a4d..854f350e2599 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -755,72 +755,12 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); inode = flo->generic_hdr.plh_inode; spin_lock(&inode->i_lock); - flo->commit_info.nbuckets = 0; - kfree(flo->commit_info.buckets); - flo->commit_info.buckets = NULL; pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg); spin_unlock(&inode->i_lock); } _filelayout_free_lseg(fl); } -static int -filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size, i; - - if (fl->commit_through_mds) - return 0; - - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - if (cinfo->ds->nbuckets >= size) { - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). 
- * */ - return 0; - } - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; - } - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets >= size) - goto out; - for (i = 0; i < cinfo->ds->nbuckets; i++) { - list_splice(&cinfo->ds->buckets[i].written, - &buckets[i].written); - list_splice(&cinfo->ds->buckets[i].committing, - &buckets[i].committing); - buckets[i].direct_verf.committed = - cinfo->ds->buckets[i].direct_verf.committed; - buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; - buckets[i].clseg = cinfo->ds->buckets[i].clseg; - } - swap(cinfo->ds->buckets, buckets); - cinfo->ds->nbuckets = size; -out: - spin_unlock(&cinfo->inode->i_lock); - kfree(buckets); - return 0; -} - static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -943,9 +883,6 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - struct nfs_commit_info cinfo; - int status; - pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, @@ -964,17 +901,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - goto out_mds; - } - return; -out_mds: - nfs_pageio_reset_write_mds(pgio); + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f343a241906a..1a4e36d07eab 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -550,17 +550,6 @@ out_err_free: goto out_free_page; } -static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) -{ - struct pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return true; - - return false; -} - static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { @@ -575,24 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); inode = ffl->generic_hdr.plh_inode; spin_lock(&inode->i_lock); - if (!ff_layout_has_rw_segments(lseg->pls_layout)) { - ffl->commit_info.nbuckets = 0; - kfree(ffl->commit_info.buckets); - ffl->commit_info.buckets = NULL; - } pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg); spin_unlock(&inode->i_lock); } _ff_layout_free_lseg(fls); } -/* Return 1 until we have multiple lsegs support */ -static int -ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) -{ - return 1; -} - static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -737,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_unlock(&mirror->lock); } -static int -ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - 
int size; - - if (cinfo->ds->nbuckets != 0) { - /* This assumes there is only one RW lseg per file. - * To support multiple lseg per file, we need to - * change struct pnfs_commit_bucket to allow dynamic - * increasing nbuckets. - */ - return 0; - } - - size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - else { - int i; - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = - NFS_INVALID_STABLE_HOW; - } - } - spin_unlock(&cinfo->inode->i_lock); - return 0; - } -} - static void ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) { @@ -944,10 +875,8 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, { struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; - struct nfs_commit_info cinfo; struct nfs4_pnfs_ds *ds; int i; - int status; retry: pnfs_generic_pg_check_layout(pgio); @@ -969,11 +898,6 @@ retry: if (pgio->pg_lseg == NULL) goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) - goto out_mds; - /* Use a direct mapping of ds_idx to pgio mirror_idx */ if (WARN_ON_ONCE(pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 683146a51599..78f317fac940 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -531,9 +531,6 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { struct pnfs_commit_array *array; - pnfs_bucket_clear_pnfs_ds_commit_verifiers(cinfo->buckets, - cinfo->nbuckets); - rcu_read_lock(); list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 9b55919e64ac..20f12f3cbe38 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -292,12 +292,6 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max) struct pnfs_commit_array *array; int rv = 0, cnt; - cnt = pnfs_bucket_scan_array(cinfo, fl_cinfo->buckets, - fl_cinfo->nbuckets, max); - rv += cnt; - max -= cnt; - if (!max) - return rv; rcu_read_lock(); list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { if (!array->lseg || !pnfs_get_commit_array(array)) @@ -353,11 +347,6 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, unsigned int nwritten; lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); - nwritten = pnfs_bucket_recover_commit_reqs(dst, - fl_cinfo->buckets, - fl_cinfo->nbuckets, - cinfo); - fl_cinfo->nwritten -= nwritten; rcu_read_lock(); list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { if (!array->lseg || !pnfs_get_commit_array(array)) @@ -412,10 +401,6 @@ pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page struct pnfs_commit_array *array; struct nfs_page *req; - req = pnfs_bucket_search_commit_reqs(fl_cinfo->buckets, - fl_cinfo->nbuckets, page); - if (req) - return req; list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { req = pnfs_bucket_search_commit_reqs(array->buckets, array->nbuckets, page); @@ -550,9 +535,6 @@ 
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, nreq++; } - nreq += pnfs_bucket_alloc_ds_commits(&list, fl_cinfo->buckets, - fl_cinfo->nbuckets, cinfo); - nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo); if (nreq == 0) goto out; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 33be2ee2a248..2903597ec88c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1284,8 +1284,6 @@ struct pnfs_ds_commit_info { struct list_head commits; unsigned int nwritten; unsigned int ncommitting; - unsigned int nbuckets; - struct pnfs_commit_bucket *buckets; }; struct nfs41_state_protection { @@ -1396,22 +1394,11 @@ struct nfs41_free_stateid_res { unsigned int status; }; -static inline void -nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) -{ - kfree(cinfo->buckets); -} - #else struct pnfs_ds_commit_info { }; -static inline void -nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) -{ -} - #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_V4_2 -- cgit v1.2.3 From 9c455a8c1e146dac3a6d1405fe6a7096177b9546 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Mar 2020 11:13:05 -0400 Subject: NFS/pNFS: Clean up pNFS commit operations Move the pNFS commit related operations into a separate structure that can be carried by the pnfs_ds_commit_info. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 6 +- fs/nfs/filelayout/filelayout.c | 20 +++--- fs/nfs/flexfilelayout/flexfilelayout.c | 19 +++--- fs/nfs/pnfs.h | 110 +++++++++++++++++++++------------ fs/nfs/pnfs_nfs.c | 13 +--- include/linux/nfs_xdr.h | 1 + 6 files changed, 98 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 61f93a0fb0e0..51ab4627c4d6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -511,10 +511,7 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct nfs_commit_info *cinfo) { mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); -#ifdef CONFIG_NFS_V4_1 - if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) - NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); -#endif + pnfs_recover_commit_reqs(list, cinfo); nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } @@ -917,6 +914,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); nfs_start_io_direct(inode); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 854f350e2599..a13e69009f19 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand "); MODULE_DESCRIPTION("The NFSv4 file layout driver"); #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) +static const struct pnfs_commit_ops filelayout_commit_ops; static loff_t filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, @@ -1045,6 +1046,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) if (flo == NULL) return NULL; pnfs_init_ds_commit_info(&flo->commit_info); + flo->commit_info.ops = &filelayout_commit_ops; return &flo->generic_hdr; } @@ -1094,6 +1096,16 @@ filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, spin_unlock(&inode->i_lock); } +static const struct pnfs_commit_ops filelayout_commit_ops = { + .setup_ds_info = filelayout_setup_ds_info, + .release_ds_info = filelayout_release_ds_info, + .mark_request_commit = 
filelayout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .search_commit_reqs = pnfs_generic_search_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, +}; static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, @@ -1108,14 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, - .setup_ds_info = filelayout_setup_ds_info, - .release_ds_info = filelayout_release_ds_info, - .mark_request_commit = filelayout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = pnfs_generic_search_commit_reqs, - .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1a4e36d07eab..d37883a2b51f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -32,6 +32,7 @@ static unsigned short io_maxretrans; +static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, @@ -52,6 +53,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); ffl->last_report_time = ktime_get(); + ffl->commit_info.ops = &ff_layout_commit_ops; return &ffl->generic_hdr; } else return NULL; @@ -2440,6 +2442,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server, return 0; } +static const struct pnfs_commit_ops ff_layout_commit_ops = { + .setup_ds_info = ff_layout_setup_ds_info, + .release_ds_info = ff_layout_release_ds_info, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2455,14 +2467,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .setup_ds_info = ff_layout_setup_ds_info, - .release_ds_info = ff_layout_release_ds_info, .free_deviceid_node = ff_layout_free_deviceid_node, - .mark_request_commit = pnfs_layout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .commit_pagelist = ff_layout_commit_pagelist, .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index faed9be6e479..b32025553f26 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -150,26 +150,6 @@ struct pnfs_layoutdriver_type 
{ const struct nfs_pageio_ops *pg_write_ops; struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); - void (*setup_ds_info)(struct pnfs_ds_commit_info *, - struct pnfs_layout_segment *); - void (*release_ds_info)(struct pnfs_ds_commit_info *, - struct inode *inode); - void (*mark_request_commit) (struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - u32 ds_commit_idx); - void (*clear_request_commit) (struct nfs_page *req, - struct nfs_commit_info *cinfo); - int (*scan_commit_lists) (struct nfs_commit_info *cinfo, - int max); - void (*recover_commit_reqs) (struct list_head *list, - struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); - int (*commit_pagelist)(struct inode *inode, - struct list_head *mds_pages, - int how, - struct nfs_commit_info *cinfo); int (*sync)(struct inode *inode, bool datasync); @@ -192,6 +172,29 @@ struct pnfs_layoutdriver_type { int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; +struct pnfs_commit_ops { + void (*setup_ds_info)(struct pnfs_ds_commit_info *, + struct pnfs_layout_segment *); + void (*release_ds_info)(struct pnfs_ds_commit_info *, + struct inode *inode); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, + struct page *page); +}; + struct pnfs_layout_hdr { refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ @@ -461,9 +464,11 @@ static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) { - if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0) return PNFS_NOT_ATTEMPTED; - return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); + return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo); } static inline struct pnfs_ds_commit_info * @@ -476,19 +481,26 @@ pnfs_get_ds_info(struct inode *inode) return ld->get_ds_info(inode); } +static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode); + if (inode_cinfo != NULL) + fl_cinfo->ops = inode_cinfo->ops; +} + static inline void pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) { INIT_LIST_HEAD(&fl_cinfo->commits); + fl_cinfo->ops = NULL; } static inline void pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - - if (ld != NULL && ld->release_ds_info != NULL) - ld->release_ds_info(fl_cinfo, inode); + if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL) + fl_cinfo->ops->release_ds_info(fl_cinfo, inode); } static inline void @@ -501,24 +513,22 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info 
*cinfo, u32 ds_commit_idx) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (lseg == NULL || ld->mark_request_commit == NULL) + if (!lseg || !fl_cinfo->ops->mark_request_commit) return false; - ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); + fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx); return true; } static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->clear_request_commit == NULL) + if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit) return false; - ld->clear_request_commit(req, cinfo); + fl_cinfo->ops->clear_request_commit(req, cinfo); return true; } @@ -526,21 +536,31 @@ static inline int pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, int max) { - if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (!fl_cinfo || fl_cinfo->nwritten == 0) return 0; - else - return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); + return fl_cinfo->ops->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo && fl_cinfo->nwritten != 0) + fl_cinfo->ops->recover_commit_reqs(head, cinfo); } static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->search_commit_reqs == NULL) + if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) return NULL; - return ld->search_commit_reqs(cinfo, page); + return fl_cinfo->ops->search_commit_reqs(cinfo, page); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -788,6 +808,11 @@ pnfs_get_ds_info(struct inode *inode) return NULL; } +static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + static inline void pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) { @@ -818,6 +843,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return 0; } +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ +} + static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 20f12f3cbe38..06df2e6663dc 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -149,17 +149,6 @@ pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo, } EXPORT_SYMBOL_GPL(pnfs_add_commit_array); -static void -pnfs_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, - struct pnfs_layout_segment *lseg) -{ - struct inode *inode = lseg->pls_layout->plh_inode; - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; - - if (ld->setup_ds_info != NULL) - ld->setup_ds_info(fl_cinfo, lseg); -} - static struct pnfs_commit_array * pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo, struct 
pnfs_layout_segment *lseg) @@ -170,7 +159,7 @@ pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo, array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); if (!array) { rcu_read_unlock(); - pnfs_setup_ds_info(fl_cinfo, lseg); + fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg); rcu_read_lock(); array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2903597ec88c..adbbeae9ce5b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1284,6 +1284,7 @@ struct pnfs_ds_commit_info { struct list_head commits; unsigned int nwritten; unsigned int ncommitting; + const struct pnfs_commit_ops *ops; }; struct nfs41_state_protection { -- cgit v1.2.3 From c84bea59449aaa699a0600a50f59d441cc1d4501 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 22 Mar 2020 14:47:38 -0400 Subject: NFS/pNFS: Simplify bucket layout segment reference counting Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 39 ++++++++++++++++++++------------------- include/linux/nfs_xdr.h | 3 +-- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 06df2e6663dc..abf16fc98346 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata) } EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); +static struct pnfs_layout_segment * +pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket) +{ + if (list_empty(&bucket->committing) && list_empty(&bucket->written)) { + struct pnfs_layout_segment *freeme = bucket->lseg; + bucket->lseg = NULL; + return freeme; + } + return NULL; +} + /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. 
* Note this must be called holding nfsi->commit_mutex @@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req, bucket = list_first_entry(&req->wb_list, struct pnfs_commit_bucket, written); - freeme = bucket->wlseg; - bucket->wlseg = NULL; + freeme = pnfs_free_bucket_lseg(bucket); } out: nfs_request_remove_commit_list(req, cinfo); @@ -103,8 +113,7 @@ pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags) for (b = &p->buckets[0]; n != 0; b++, n--) { INIT_LIST_HEAD(&b->written); INIT_LIST_HEAD(&b->committing); - b->wlseg = NULL; - b->clseg = NULL; + b->lseg = NULL; b->direct_verf.committed = NFS_INVALID_STABLE_HOW; } return p; @@ -246,12 +255,6 @@ pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - if (bucket->clseg == NULL) - bucket->clseg = pnfs_get_lseg(bucket->wlseg); - if (list_empty(src)) { - pnfs_put_lseg(bucket->wlseg); - bucket->wlseg = NULL; - } } return ret; } @@ -317,9 +320,8 @@ restart: if (!nwritten) continue; ret += nwritten; - if (list_empty(&b->written)) { - freeme = b->wlseg; - b->wlseg = NULL; + freeme = pnfs_free_bucket_lseg(b); + if (freeme) { pnfs_put_lseg(freeme); goto restart; } @@ -405,15 +407,12 @@ pnfs_bucket_get_committing(struct list_head *head, struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct pnfs_layout_segment *freeme; struct list_head *pos; list_for_each(pos, &bucket->committing) cinfo->ds->ncommitting--; list_splice_init(&bucket->committing, head); - freeme = bucket->clseg; - bucket->clseg = NULL; - return freeme; + return pnfs_free_bucket_lseg(bucket); } static struct nfs_commit_data * @@ -425,6 +424,8 @@ pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, if (!data) return NULL; data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo); + if (!data->lseg) + data->lseg = pnfs_get_lseg(bucket->lseg); return data; } @@ -1182,8 +1183,8 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, * off due to a rewrite, in which case it will be done in * pnfs_common_clear_request_commit */ - WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); - buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); + if (!buckets[ds_commit_idx].lseg) + buckets[ds_commit_idx].lseg = pnfs_get_lseg(lseg); } set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index adbbeae9ce5b..7bbb1f6fc1b1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1265,8 +1265,7 @@ struct nfstime4 { struct pnfs_commit_bucket { struct list_head written; struct list_head committing; - struct pnfs_layout_segment *wlseg; - struct pnfs_layout_segment *clseg; + struct pnfs_layout_segment *lseg; struct nfs_writeverf direct_verf; }; -- cgit v1.2.3 From 08ca8b21f760c0ed5034a5c122092eec22ccf8f4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Apr 2020 13:04:49 -0400 Subject: NFS: Fix races nfs_page_group_destroy() vs nfs_destroy_unlinked_subrequests() When a subrequest is being detached from the subgroup, we want to ensure that it is not holding the group lock, or in the process of waiting for the group lock. 
Fixes: 5b2b5187fa85 ("NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests() race cases") Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 67 ++++++++++++++++++++++++++++++++---------------- fs/nfs/write.c | 10 ++++++-- include/linux/nfs_page.h | 2 ++ 3 files changed, 55 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index be5e209399ea..0e3f0f241d83 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -133,47 +133,70 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* - * nfs_page_group_lock - lock the head of the page group - * @req - request in group that is to be locked + * nfs_page_set_headlock - set the request PG_HEADLOCK + * @req: request that is to be locked * - * this lock must be held when traversing or modifying the page - * group list + * this lock must be held when modifying req->wb_head * * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req) +nfs_page_set_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) return 0; - set_bit(PG_CONTENDED1, &head->wb_flags); + set_bit(PG_CONTENDED1, &req->wb_flags); smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); } /* - * nfs_page_group_unlock - unlock the head of the page group - * @req - request in group that is to be unlocked + * nfs_page_clear_headlock - clear the request PG_HEADLOCK + * @req: request that is to be locked */ void -nfs_page_group_unlock(struct nfs_page *req) +nfs_page_clear_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - smp_mb__before_atomic(); - clear_bit(PG_HEADLOCK, &head->wb_flags); + clear_bit(PG_HEADLOCK, &req->wb_flags); smp_mb__after_atomic(); - if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + if (!test_bit(PG_CONTENDED1, &req->wb_flags)) return; - wake_up_bit(&head->wb_flags, PG_HEADLOCK); + wake_up_bit(&req->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req: request in group that is to be locked + * + * this lock must be held when traversing or modifying the page + * group list + * + * return 0 on success, < 0 on error + */ +int +nfs_page_group_lock(struct nfs_page *req) +{ + int ret; + + ret = nfs_page_set_headlock(req); + if (ret || req->wb_head == req) + return ret; + return nfs_page_set_headlock(req->wb_head); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req: request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + if (req != req->wb_head) + nfs_page_clear_headlock(req->wb_head); + nfs_page_clear_headlock(req); } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 626e99cbb50e..a6d7926b0653 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -428,22 +428,28 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, destroy_list = (subreq->wb_this_page == old_head) ? 
NULL : subreq->wb_this_page; + /* Note: lock subreq in order to change subreq->wb_head */ + nfs_page_set_headlock(subreq); WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ subreq->wb_this_page = subreq; + subreq->wb_head = subreq; clear_bit(PG_REMOVE, &subreq->wb_flags); /* Note: races with nfs_page_group_destroy() */ if (!kref_read(&subreq->wb_kref)) { /* Check if we raced with nfs_page_group_destroy() */ - if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) { + nfs_page_clear_headlock(subreq); nfs_free_request(subreq); + } else + nfs_page_clear_headlock(subreq); continue; } + nfs_page_clear_headlock(subreq); - subreq->wb_head = subreq; nfs_release_request(old_head); if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 0bbd587fac6a..7e9419d74b86 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -142,6 +142,8 @@ extern void nfs_unlock_and_release_request(struct nfs_page *); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern int nfs_page_set_headlock(struct nfs_page *req); +extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* -- cgit v1.2.3 From a62f8e3bd836bf1abde1648a45e14afd050dbd23 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Mar 2020 11:12:16 -0400 Subject: NFS: Clean up nfs_lock_and_join_requests() Clean up nfs_lock_and_join_requests() to simplify the calculation of the range covered by the page group, taking into account the presence of mirrors. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 74 +++++++++++++++++++++++++++++++++++++++ fs/nfs/write.c | 91 +++++++++++------------------------------------- include/linux/nfs_page.h | 1 + 3 files changed, 95 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f535a92403bf..261236157e33 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -130,6 +130,80 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) } EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock + * + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. + */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } +} + +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. 
+ */ +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) +{ + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + +/* + * nfs_page_group_lock_subrequests - try to lock the subrequests + * @head: head request of page group + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. + */ +int nfs_page_group_lock_subrequests(struct nfs_page *head) +{ + struct nfs_page *subreq; + int ret; + + /* lock each request in the page group */ + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + return ret; + } + return 0; +} + /* * nfs_page_set_headlock - set the request PG_HEADLOCK * @req: request that is to be locked diff --git a/fs/nfs/write.c b/fs/nfs/write.c index a6d7926b0653..832cf57ea442 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -379,34 +379,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } -/* - * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req - * - * this is a helper function for nfs_lock_and_join_requests - * - * @inode - inode associated with request page group, must be holding inode lock - * @head - head request of page group, must be holding head lock - * @req - request that couldn't lock and needs to wait on the req bit lock - * - * NOTE: this must be called holding page_group bit lock - * which will be released before returning. - * - * returns 0 on success, < 0 on error. 
- */ -static void -nfs_unroll_locks(struct inode *inode, struct nfs_page *head, - struct nfs_page *req) -{ - struct nfs_page *tmp; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { - if (!kref_read(&tmp->wb_kref)) - continue; - nfs_unlock_and_release_request(tmp); - } -} - /* * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests * @@ -487,7 +459,7 @@ nfs_lock_and_join_requests(struct page *page) struct inode *inode = page_file_mapping(page)->host; struct nfs_page *head, *subreq; struct nfs_page *destroy_list = NULL; - unsigned int total_bytes; + unsigned int pgbase, off, bytes; int ret; try_again: @@ -520,49 +492,30 @@ try_again: goto release_request; /* lock each request in the page group */ - total_bytes = head->wb_bytes; + ret = nfs_page_group_lock_subrequests(head); + if (ret < 0) + goto release_request; + + pgbase = head->wb_pgbase; + bytes = head->wb_bytes; + off = head->wb_offset; for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - - if (!kref_get_unless_zero(&subreq->wb_kref)) { - if (subreq->wb_offset == head->wb_offset + total_bytes) - total_bytes += subreq->wb_bytes; - continue; - } - - while (!nfs_lock_request(subreq)) { - /* - * Unlock page to allow nfs_page_group_sync_on_bit() - * to succeed - */ - nfs_page_group_unlock(head); - ret = nfs_wait_on_request(subreq); - if (!ret) - ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unroll_locks(inode, head, subreq); - nfs_release_request(subreq); - goto release_request; - } - } - /* - * Subrequests are always contiguous, non overlapping - * and in order - but may be repeated (mirrored writes). - */ - if (subreq->wb_offset == (head->wb_offset + total_bytes)) { - /* keep track of how many bytes this group covers */ - total_bytes += subreq->wb_bytes; - } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || - ((subreq->wb_offset + subreq->wb_bytes) > - (head->wb_offset + total_bytes)))) { - nfs_page_group_unlock(head); - nfs_unroll_locks(inode, head, subreq); - nfs_unlock_and_release_request(subreq); - ret = -EIO; - goto release_request; + /* Subrequests should always form a contiguous range */ + if (pgbase > subreq->wb_pgbase) { + off -= pgbase - subreq->wb_pgbase; + bytes += pgbase - subreq->wb_pgbase; + pgbase = subreq->wb_pgbase; } + bytes = max(subreq->wb_pgbase + subreq->wb_bytes + - pgbase, bytes); } + /* Set the head request's range to cover the former page group */ + head->wb_pgbase = pgbase; + head->wb_bytes = bytes; + head->wb_offset = off; + /* Now that all requests are locked, make sure they aren't on any list. 
* Commit list removal accounting is done after locks are dropped */ subreq = head; @@ -576,10 +529,6 @@ try_again: /* destroy list will be terminated by head */ destroy_list = head->wb_this_page; head->wb_this_page = head; - - /* change head request to cover whole range that - * the former page group covered */ - head->wb_bytes = total_bytes; } /* Postpone destruction of this request */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 7e9419d74b86..dd205bc6bc58 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -139,6 +139,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); +extern int nfs_page_group_lock_subrequests(struct nfs_page *head); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); -- cgit v1.2.3 From e00ed89d7bd59c4ae49d6aeeee567187b1357a4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Mar 2020 12:40:47 -0400 Subject: NFS: Refactor nfs_lock_and_join_requests() Refactor nfs_lock_and_join_requests() in order to separate out the subrequest merging into its own function nfs_lock_and_join_group() that can be used by O_DIRECT. Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 26 +++++++- fs/nfs/write.c | 164 ++++++++++++++++++++++++++++------------------- include/linux/nfs_page.h | 1 + 3 files changed, 123 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b9805d1dac75..f61f96603df7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -130,6 +130,25 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) } EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); +/* + * nfs_page_lock_head_request - page lock the head of the page group + * @req: any member of the page group + */ +struct nfs_page * +nfs_page_group_lock_head(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + while (!nfs_lock_request(head)) { + int ret = nfs_wait_on_request(head); + if (ret < 0) + return ERR_PTR(ret); + } + if (head != req) + kref_get(&head->wb_kref); + return head; +} + /* * nfs_unroll_locks - unlock all newly locked reqs and wait on @req * @head: head request of page group, must be holding head lock @@ -186,14 +205,16 @@ nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) * @head: head request of page group * * This is a helper function for nfs_lock_and_join_requests which - * must be called with the head request and page group both locked. - * On error, it returns with the page group unlocked. + * must be called with the head request locked. 
*/ int nfs_page_group_lock_subrequests(struct nfs_page *head) { struct nfs_page *subreq; int ret; + ret = nfs_page_group_lock(head); + if (ret < 0) + return ret; /* lock each request in the page group */ for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { @@ -201,6 +222,7 @@ int nfs_page_group_lock_subrequests(struct nfs_page *head) if (ret < 0) return ret; } + nfs_page_group_unlock(head); return 0; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 832cf57ea442..63b64333c3ea 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + static struct nfs_page * nfs_page_private_request(struct page *page) { @@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page) return req; } +static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req, *head; + int ret; + + for (;;) { + req = nfs_page_find_head_request(page); + if (!req) + return req; + head = nfs_page_group_lock_head(req); + if (head != req) + nfs_release_request(req); + if (IS_ERR(head)) + return head; + ret = nfs_cancel_remove_inode(head, inode); + if (ret < 0) { + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } + /* Ensure that nobody removed the request before we locked it */ + if (head == nfs_page_private_request(page)) + break; + if (PageSwapCache(page)) + break; + nfs_unlock_and_release_request(head); + } + return head; +} + /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { @@ -436,65 +491,22 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, } /* - * nfs_lock_and_join_requests - join all subreqs to the head req and return - * a locked reference, cancelling any pending - * operations for this page. - * - * @page - the page used to lookup the "page group" of nfs_page structures + * nfs_join_page_group - destroy subrequests of the head req + * @head: the page used to lookup the "page group" of nfs_page structures + * @inode: Inode to which the request belongs. * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations * and finally updating the head request to cover the whole range covered by * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. - * - * Returns a locked, referenced pointer to the head request - which after - * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an - * error was encountered. 
*/ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page) +static void +nfs_join_page_group(struct nfs_page *head, struct inode *inode) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *head, *subreq; + struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; unsigned int pgbase, off, bytes; - int ret; - -try_again: - /* - * A reference is taken only on the head request which acts as a - * reference to the whole page group - the group will not be destroyed - * until the head reference is released. - */ - head = nfs_page_find_head_request(page); - if (!head) - return NULL; - - /* lock the page head first in order to avoid an ABBA inefficiency */ - if (!nfs_lock_request(head)) { - ret = nfs_wait_on_request(head); - nfs_release_request(head); - if (ret < 0) - return ERR_PTR(ret); - goto try_again; - } - - /* Ensure that nobody removed the request before we locked it */ - if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { - nfs_unlock_and_release_request(head); - goto try_again; - } - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto release_request; - - /* lock each request in the page group */ - ret = nfs_page_group_lock_subrequests(head); - if (ret < 0) - goto release_request; pgbase = head->wb_pgbase; bytes = head->wb_bytes; @@ -531,30 +543,50 @@ try_again: head->wb_this_page = head; } - /* Postpone destruction of this request */ - if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { - set_bit(PG_INODE_REF, &head->wb_flags); - kref_get(&head->wb_kref); - atomic_long_inc(&NFS_I(inode)->nrequests); - } + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); +} - nfs_page_group_unlock(head); +/* + * nfs_lock_and_join_requests - join all subreqs to the head req + * @page: the page used to lookup the "page group" of nfs_page structures + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *head; + int ret; - nfs_destroy_unlinked_subrequests(destroy_list, head, inode); + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_find_and_lock_page_request(page); + if (IS_ERR_OR_NULL(head)) + return head; - /* Did we lose a race with nfs_inode_remove_request()? 
*/ - if (!(PagePrivate(page) || PageSwapCache(page))) { + /* lock each request in the page group */ + ret = nfs_page_group_lock_subrequests(head); + if (ret < 0) { nfs_unlock_and_release_request(head); - return NULL; + return ERR_PTR(ret); } - /* still holds ref on head from nfs_page_find_head_request - * and still has lock on head from lock loop */ - return head; + nfs_join_page_group(head, inode); -release_request: - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + return head; } static void nfs_write_error(struct nfs_page *req, int error) diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index dd205bc6bc58..99198c039bd6 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -139,6 +139,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); +extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); extern int nfs_page_group_lock_subrequests(struct nfs_page *head); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); -- cgit v1.2.3 From ed5d588fe47feef290f271022820e255d8371561 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 30 Mar 2020 20:57:49 -0400 Subject: NFS: Try to join page groups before an O_DIRECT retransmission If we have to retransmit requests, try to join their page groups first. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 20 ++++++++++++++++++++ fs/nfs/write.c | 2 +- include/linux/nfs_page.h | 1 + 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8074304fd5b4..a57e7c72c7f4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -505,6 +505,24 @@ out: return result; } +static void +nfs_direct_join_group(struct list_head *list, struct inode *inode) +{ + struct nfs_page *req, *next; + + list_for_each_entry(req, list, wb_list) { + if (req->wb_head != req || req->wb_this_page == req) + continue; + for (next = req->wb_this_page; + next != req->wb_head; + next = next->wb_this_page) { + nfs_list_remove_request(next); + nfs_release_request(next); + } + nfs_join_page_group(req, inode); + } +} + static void nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, @@ -527,6 +545,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + nfs_direct_join_group(&reqs, dreq->inode); + dreq->count = 0; dreq->max_count = 0; list_for_each_entry(req, &reqs, wb_list) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 63b64333c3ea..df4b87c30ac9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -501,7 +501,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. 
*/ -static void +void nfs_join_page_group(struct nfs_page *head, struct inode *inode) { struct nfs_page *subreq; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 99198c039bd6..c32c15216da3 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -141,6 +141,7 @@ extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req); extern int nfs_page_group_lock_subrequests(struct nfs_page *head); +extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); -- cgit v1.2.3 From 93ce4af774bc3d8a72ce2271d03241c96383629d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 6 Apr 2020 13:39:29 -0400 Subject: NFS: Clean up process of marking inode stale. Instead of the various open coded calls to set the NFS_INO_STALE bit and call nfs_zap_caches(), consolidate them into a single function nfs_set_inode_stale(). Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 +++-- fs/nfs/inode.c | 18 +++++++++++++----- fs/nfs/nfstrace.h | 1 + fs/nfs/read.c | 2 +- include/linux/nfs_fs.h | 1 + 5 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f14184d0ba82..d729d8311c7e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2669,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto out; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a10fb87c6ac3..b9d0921cb4fe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -62,7 +62,6 @@ /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; -static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; @@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime); * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ -static void nfs_invalidate_inode(struct inode *inode) +static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); + trace_nfs_set_inode_stale(inode); +} + +void nfs_set_inode_stale(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_inode_stale_locked(inode); + spin_unlock(&inode->i_lock); } struct nfs_find_desc { @@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = 0; break; case -ESTALE: - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto err_out; } @@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) 
*/ - nfs_invalidate_inode(inode); + nfs_set_inode_stale_locked(inode); return -ESTALE; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a9588d19a5ae..7e7a97ae21ed 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, int error \ ), \ TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale); DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 34bb9add2302..13b22e898116 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task, trace_nfs_readpage_done(task, hdr); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); nfs_mark_for_revalidate(inode); } return 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5d5b91e54f73..73eda45f1cfd 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -354,6 +354,7 @@ static inline unsigned long nfs_save_change_attribute(struct inode *dir) extern int nfs_sync_mapping(struct address_space *mapping); extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); extern void nfs_zap_caches(struct inode *); +extern void nfs_set_inode_stale(struct inode *inode); extern void nfs_invalidate_atime(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *); -- cgit v1.2.3
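
For readers following the pNFS commit rework above, here is a minimal sketch, not taken from any of the patches, of how a layout driver is expected to wire up the new per-commit-info operations after "NFS/pNFS: Clean up pNFS commit operations". The "example" driver, its layout structure and the two callbacks marked as placeholders are invented for illustration; only the registration pattern (the ops table hanging off the driver's commit_info rather than off struct pnfs_layoutdriver_type, with generic code dispatching through cinfo->ds->ops) reflects the filelayout and flexfiles changes in the series.

/* Hypothetical layout driver sketch; assumes the fs/nfs "pnfs.h" and
 * <linux/slab.h> definitions are in scope, as in the real layout drivers.
 */
struct example_layout {
	struct pnfs_layout_hdr		generic_hdr;
	struct pnfs_ds_commit_info	commit_info;
};

static const struct pnfs_commit_ops example_commit_ops = {
	.setup_ds_info		= example_setup_ds_info,	/* placeholder */
	.release_ds_info	= example_release_ds_info,	/* placeholder */
	.mark_request_commit	= pnfs_layout_mark_request_commit,
	.clear_request_commit	= pnfs_generic_clear_request_commit,
	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
	.commit_pagelist	= example_commit_pagelist,	/* placeholder */
};

static struct pnfs_layout_hdr *
example_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
	struct example_layout *lo = kzalloc(sizeof(*lo), gfp_flags);

	if (!lo)
		return NULL;
	/* Initialise the commit array list and attach the ops table; from
	 * here on, generic helpers such as pnfs_scan_commit_lists() and
	 * pnfs_recover_commit_reqs() reach the driver through
	 * cinfo->ds->ops instead of NFS_SERVER(inode)->pnfs_curr_ld.
	 */
	pnfs_init_ds_commit_info(&lo->commit_info);
	lo->commit_info.ops = &example_commit_ops;
	return &lo->generic_hdr;
}

O_DIRECT requests pick up the same table through pnfs_init_ds_commit_info_ops(), which copies the ops pointer from the inode's layout header into the dreq's ds_cinfo, so buffered and direct writes both dispatch their commit handling through the one structure.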