From 2412e927604e0af7e17ae4b688b85a1e87e378fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 11:59:49 -0400 Subject: sunrpc: Const-ify instances of struct svc_xprt_ops Close an attack vector by moving the arrays of server-side transport methods to read-only memory. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 6 +++--- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2b720fa35c4f..b434f524d677 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -687,7 +687,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags); } -static struct svc_xprt_ops svc_udp_ops = { +static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, @@ -1229,7 +1229,7 @@ static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) { } -static struct svc_xprt_ops svc_tcp_bc_ops = { +static const struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_create = svc_bc_tcp_create, .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, @@ -1263,7 +1263,7 @@ static void svc_cleanup_bc_xprt_sock(void) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -static struct svc_xprt_ops svc_tcp_ops = { +static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e660d4965b18..2aa8473dcc97 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -70,7 +70,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); static void svc_rdma_kill_temp_xprt(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_ops = { +static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, @@ -98,7 +98,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, static void svc_rdma_bc_detach(struct svc_xprt *); static void svc_rdma_bc_free(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_bc_ops = { +static const struct svc_xprt_ops svc_rdma_bc_ops = { .xpo_create = svc_rdma_bc_create, .xpo_detach = svc_rdma_bc_detach, .xpo_free = svc_rdma_bc_free, -- cgit v1.2.3 From afea5657c20b11ec9f895ac5cc33b560fb1e0276 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 12:00:06 -0400 Subject: sunrpc: Const-ify struct sv_serv_ops Close an attack vector by moving the arrays of per-server methods to read-only memory. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 10 +++++----- fs/nfsd/nfssvc.c | 2 +- include/linux/sunrpc/svc.h | 6 +++--- net/sunrpc/svc.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 726b6cecf430..b995bdc13976 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -396,7 +396,7 @@ out_rqst: return error; } -static struct svc_serv_ops lockd_sv_ops = { +static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, .svo_enqueue_xprt = svc_xprt_do_enqueue, }; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 34323877ec13..2cddf7f437e6 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -226,26 +226,26 @@ err_bind: return ret; } -static struct svc_serv_ops nfs40_cb_sv_ops = { +static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) -static struct svc_serv_ops nfs41_cb_sv_ops = { +static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; @@ -254,8 +254,8 @@ static struct svc_serv_ops *nfs4_cb_sv_ops[] = { static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + const struct svc_serv_ops *sv_ops; struct svc_serv *serv; - struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 063ae7de2c12..7e3af3ef0917 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -475,7 +475,7 @@ static int nfsd_get_default_max_blksize(void) return ret; } -static struct svc_serv_ops nfsd_thread_sv_ops = { +static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a3f8af9bd543..38f561b2dda3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -99,7 +99,7 @@ struct svc_serv { unsigned int sv_nrpools; /* number of thread pools */ struct svc_pool * sv_pools; /* array of thread pools */ - struct svc_serv_ops *sv_ops; /* server operations */ + const struct svc_serv_ops *sv_ops; /* server operations */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct list_head sv_cb_list; /* queue for callback requests * that arrive over the same @@ -465,7 +465,7 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); int svc_bind(struct svc_serv *serv, struct net *net); struct svc_serv *svc_create(struct svc_program *, unsigned int, - struct svc_serv_ops *); + const struct svc_serv_ops *); struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, @@ -475,7 +475,7 @@ void svc_exit_thread(struct svc_rqst *); unsigned int svc_pool_map_get(void); void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - struct svc_serv_ops *); + const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 85ce0db5b0a6..aa04666f929d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -421,7 +421,7 @@ __svc_init_bc(struct svc_serv *serv) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -486,7 +486,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { return __svc_create(prog, bufsize, /*npools*/1, ops); } @@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); -- cgit v1.2.3 From 7075a867ce5748007b299bd28f12754cd51719d4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 12:00:14 -0400 Subject: svcrdma: Clean up svc_rdma_build_read_chunk() Dan Carpenter observed that the while() loop in svc_rdma_build_read_chunk() does not document the assumption that the loop interior is always executed at least once. Defensive: the function now returns -EINVAL if this assumption fails. Suggested-by: Dan Carpenter Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 933f79bed270..1f34fae7aec0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -660,19 +660,21 @@ out_initerr: return -EIO; } +/* Walk the segments in the Read chunk starting at @p and construct + * RDMA Read operations to pull the chunk to the server. + */ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { int ret; + ret = -EINVAL; info->ri_chunklen = 0; - while (*p++ != xdr_zero) { + while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { u32 rs_handle, rs_length; u64 rs_offset; - if (be32_to_cpup(p++) != info->ri_position) - break; rs_handle = be32_to_cpup(p++); rs_length = be32_to_cpup(p++); p = xdr_decode_hyper(p, &rs_offset); -- cgit v1.2.3 From 193bcb7b3719a315814dce40fc8dcd1af786ea64 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 18 Aug 2017 11:12:35 -0400 Subject: svcrdma: Populate tail iovec when receiving So that NFS WRITE payloads can eventually be placed directly into a file's page cache, enable the RPC-over-RDMA transport to present these payloads in the xdr_buf's page list, while placing trailing content (such as a GETATTR operation) in the xdr_buf's tail. After this change, the RPC-over-RDMA's "copy tail" hack, added by commit a97c331f9aa9 ("svcrdma: Handle additional inline content"), is no longer needed and can be removed. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 108 ++++++-------------------------------- 1 file changed, 15 insertions(+), 93 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1f34fae7aec0..7dcda4597057 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -691,78 +691,6 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, return ret; } -/* If there is inline content following the Read chunk, append it to - * the page list immediately following the data payload. This has to - * be done after the reader function has determined how many pages - * were consumed for RDMA Read. - * - * On entry, ri_pageno and ri_pageoff point directly to the end of the - * page list. On exit, both have been updated to the new "next byte". - * - * Assumptions: - * - Inline content fits entirely in rq_pages[0] - * - Trailing content is only a handful of bytes - */ -static int svc_rdma_copy_tail(struct svc_rqst *rqstp, - struct svc_rdma_read_info *info) -{ - struct svc_rdma_op_ctxt *head = info->ri_readctxt; - unsigned int tail_length, remaining; - u8 *srcp, *destp; - - /* Assert that all inline content fits in page 0. This is an - * implementation limit, not a protocol limit. - */ - if (head->arg.head[0].iov_len > PAGE_SIZE) { - pr_warn_once("svcrdma: too much trailing inline content\n"); - return -EINVAL; - } - - srcp = head->arg.head[0].iov_base; - srcp += info->ri_position; - tail_length = head->arg.head[0].iov_len - info->ri_position; - remaining = tail_length; - - /* If there is room on the last page in the page list, try to - * fit the trailing content there. - */ - if (info->ri_pageoff > 0) { - unsigned int len; - - len = min_t(unsigned int, remaining, - PAGE_SIZE - info->ri_pageoff); - destp = page_address(rqstp->rq_pages[info->ri_pageno]); - destp += info->ri_pageoff; - - memcpy(destp, srcp, len); - srcp += len; - destp += len; - info->ri_pageoff += len; - remaining -= len; - - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; - } - } - - /* Otherwise, a fresh page is needed. */ - if (remaining) { - head->arg.pages[info->ri_pageno] = - rqstp->rq_pages[info->ri_pageno]; - head->count++; - - destp = page_address(rqstp->rq_pages[info->ri_pageno]); - memcpy(destp, srcp, remaining); - info->ri_pageoff += remaining; - } - - head->arg.page_len += tail_length; - head->arg.len += tail_length; - head->arg.buflen += tail_length; - return 0; -} - /* Construct RDMA Reads to pull over a normal Read chunk. The chunk * data lands in the page list of head->arg.pages. * @@ -787,34 +715,28 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - /* Read chunk may need XDR round-up (see RFC 5666, s. 3.7). + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. */ - if (info->ri_chunklen & 3) { - u32 padlen = 4 - (info->ri_chunklen & 3); - - info->ri_chunklen += padlen; + head->arg.tail[0].iov_base = + head->arg.head[0].iov_base + info->ri_position; + head->arg.tail[0].iov_len = + head->arg.head[0].iov_len - info->ri_position; + head->arg.head[0].iov_len = info->ri_position; - /* NB: data payload always starts on XDR alignment, - * thus the pad can never contain a page boundary. - */ - info->ri_pageoff += padlen; - if (info->ri_pageoff == PAGE_SIZE) { - info->ri_pageno++; - info->ri_pageoff = 0; - } - } + /* Read chunk may need XDR roundup (see RFC 5666, s. 3.7). + * + * NFSv2/3 write decoders need the length of the tail to + * contain the size of the roundup padding. + */ + head->arg.tail[0].iov_len += 4 - (info->ri_chunklen & 3); head->arg.page_len = info->ri_chunklen; head->arg.len += info->ri_chunklen; head->arg.buflen += info->ri_chunklen; - if (info->ri_position < head->arg.head[0].iov_len) { - ret = svc_rdma_copy_tail(rqstp, info); - if (ret < 0) - goto out; - } - head->arg.head[0].iov_len = info->ri_position; - out: return ret; } -- cgit v1.2.3 From 5a25bfd28c321e522dbe8083ad6219fa2a820610 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Aug 2017 15:06:06 -0400 Subject: svcrdma: Limit RQ depth Ensure that the chosen Receive Queue depth for a newly created transport does not overrun the QP WR limit of the underlying device. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2aa8473dcc97..cdb04f8a0c25 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -167,8 +167,8 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) { unsigned int i; - /* Each RPC/RDMA credit can consume a number of send - * and receive WQEs. One ctxt is allocated for each. + /* Each RPC/RDMA credit can consume one Receive and + * one Send WQE at the same time. */ i = xprt->sc_sq_depth + xprt->sc_rq_depth; @@ -742,13 +742,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); newxprt->sc_max_req_size = svcrdma_max_req_size; - newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, - svcrdma_max_requests); - newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); - newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, - svcrdma_max_bc_requests); + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing receive depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = newxprt->sc_rq_depth - 2; + newxprt->sc_max_bc_requests = 2; + } + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); newxprt->sc_sq_depth = newxprt->sc_rq_depth; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); -- cgit v1.2.3 From 26fb2254dd33b02a522fac42745693f5969b7d4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Aug 2017 15:06:22 -0400 Subject: svcrdma: Estimate Send Queue depth properly The rdma_rw API adjusts max_send_wr upwards during the rdma_create_qp() call. If the ULP actually wants to take advantage of these extra resources, it must increase the size of its send completion queue (created before rdma_create_qp is called) and increase its send queue accounting limit. Use the new rdma_rw_mr_factor API to figure out the correct value to use for the Send Queue and Send Completion Queue depths. And, ensure that the chosen Send Queue depth for a newly created transport does not overrun the QP WR limit of the underlying device. Lastly, there's no longer a need to carry the Send Queue depth in struct svcxprt_rdma, since the value is used only in the svc_rdma_accept() path. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index cdb04f8a0c25..5caf8e722a11 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include "xprt_rdma.h" @@ -713,7 +714,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct ib_qp_init_attr qp_attr; struct ib_device *dev; struct sockaddr *sap; - unsigned int i; + unsigned int i, ctxts; int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); @@ -754,7 +755,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); - newxprt->sc_sq_depth = newxprt->sc_rq_depth; + ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); + ctxts *= newxprt->sc_max_requests; + newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts; + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing send depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_sq_depth = dev->attrs.max_qp_wr; + } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) @@ -789,8 +797,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.port_num = newxprt->sc_port_num; - qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; - qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_rdma_ctxs = ctxts; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; @@ -858,6 +866,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); dprintk(" max_sge : %d\n", newxprt->sc_max_sge); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); dprintk(" ord : %d\n", newxprt->sc_ord); -- cgit v1.2.3