From cee6a5372f8804f58acc87f07816f64db36718e2 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:35 +0000 Subject: RPC: remove check for impossible condition in rpc_make_runnable queue_work() only returns 0 or 1, never a negative value. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 59e599498e3..93107265256 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -299,15 +299,8 @@ static void rpc_make_runnable(struct rpc_task *task) if (rpc_test_and_set_running(task)) return; if (RPC_IS_ASYNC(task)) { - int status; - INIT_WORK(&task->u.tk_work, rpc_async_schedule); - status = queue_work(rpciod_workqueue, &task->u.tk_work); - if (status < 0) { - printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); - task->tk_status = status; - return; - } + queue_work(rpciod_workqueue, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } -- cgit v1.2.3 From eabf5baaaaf41b6a0273043cfb06d53dca67acef Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:36 +0000 Subject: RPC: clarify rpc_run_task error handling rpc_run_task can only fail if it is not passed in a preallocated task. However, that is not at all clear with the current code. So remove several impossible to occur failure checks. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 6 ------ net/sunrpc/sched.c | 6 ------ 2 files changed, 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 57d344cf225..8b5a6b40d37 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -636,12 +636,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); - if (task->tk_status != 0) { - int ret = task->tk_status; - rpc_put_task(task); - return ERR_PTR(ret); - } - if (task->tk_action == NULL) rpc_call_start(task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 93107265256..5681c6a12d2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -836,12 +836,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) } rpc_init_task(task, setup_data); - if (task->tk_status < 0) { - int err = task->tk_status; - rpc_put_task(task); - return ERR_PTR(err); - } - task->tk_flags |= flags; dprintk("RPC: allocated task %p\n", task); return task; -- cgit v1.2.3 From cbdabc7f8bf14ca1d40ab1cb86f64b3bc09716e8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:20 +0000 Subject: NFSv4.1: filelayout async error handler Use our own async error handler. Mark the layout as failed and retry i/o through the MDS on specified errors. Update the mds_offset in nfs_readpage_retry so that a failed short-read retry to a DS gets correctly resent through the MDS. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 + fs/nfs/nfs4filelayout.c | 81 +++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 35 ++++++++++++++++---- fs/nfs/nfs4state.c | 1 + fs/nfs/read.c | 1 + include/linux/nfs_xdr.h | 1 + include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 8 +++++ 8 files changed, 123 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5e9df992cd7..1a3228e9ea2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -285,6 +285,7 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); extern int nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 3608411653d..6a424c19abe 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -40,6 +40,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dean Hildebrand "); MODULE_DESCRIPTION("The NFSv4 file layout driver"); +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + static int filelayout_set_layoutdriver(struct nfs_server *nfss) { @@ -100,6 +102,83 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + return 0; +} + /* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its @@ -109,6 +188,8 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = (struct nfs_read_data *)data; + rdata->read_done_cb = filelayout_read_done_cb; + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, 0, task)) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d0962393330..1dc80903944 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3074,15 +3074,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); - dprintk("--> %s\n", __func__); - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; @@ -3094,12 +3089,40 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb(task, data); +} + static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) { data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } +/* Reset the the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 69c83637312..ab1bf5bb021 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1453,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session) { nfs4_schedule_lease_recovery(session->clp); } +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); void nfs41_handle_recall_slot(struct nfs_client *clp) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f4d0fcffcb5..f40c7f4dc16 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -391,6 +391,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data return; /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b63faef5052..eb0e8708435 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1020,6 +1020,7 @@ struct nfs_read_data { struct pnfs_layout_segment *lseg; struct nfs_client *ds_clp; /* pNFS data server */ const struct rpc_call_ops *mds_ops; + int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); __u64 mds_offset; struct page *page_array[NFS_PAGEVEC_SIZE]; }; diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ef9476a36ff..db7bcaf7c5b 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -129,6 +129,7 @@ struct rpc_create_args { struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, struct rpc_program *, u32); +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8b5a6b40d37..edaf56e2ed2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -597,6 +597,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) } } +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + rpc_task_release_client(task); + rpc_task_set_client(task, clnt); +} +EXPORT_SYMBOL_GPL(rpc_task_reset_client); + + static void rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) { -- cgit v1.2.3 From bd7ea31b9e8a342be76e0fe8d638343886c2d8c5 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 9 Feb 2011 19:45:28 +0000 Subject: RPCRDMA: Fix to XDR page base interpretation in marshalling logic. The RPCRDMA marshalling logic assumed that xdr->page_base was an offset into the first page of xdr->page_list. It is in fact an offset into the xdr->page_list itself, that is, it selects the first page in the page_list and the offset into that page. The symptom depended in part on the rpc_memreg_strategy, if it was FRMR, or some other one-shot mapping mode, the connection would get torn down on a base and bounds error. When the badly marshalled RPC was retransmitted it would reconnect, get the error, and tear down the connection again in a loop forever. This resulted in a hung-mount. For the other modes, it would result in silent data corruption. This bug is most easily reproduced by writing more data than the filesystem has space for. This fix corrects the page_base assumption and otherwise simplifies the iov mapping logic. Signed-off-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 86 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 44 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 2ac3f6e8adf..554d0814c87 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -87,6 +87,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) { int len, n = 0, p; + int page_base; + struct page **ppages; if (pos == 0 && xdrbuf->head[0].iov_len) { seg[n].mr_page = NULL; @@ -95,34 +97,32 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, ++n; } - if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) { - if (n == nsegs) - return 0; - seg[n].mr_page = xdrbuf->pages[0]; - seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base; - seg[n].mr_len = min_t(u32, - PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len); - len = xdrbuf->page_len - seg[n].mr_len; + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = xdrbuf->page_base & ~PAGE_MASK; + p = 0; + while (len && n < nsegs) { + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); + BUG_ON(seg[n].mr_len > PAGE_SIZE); + len -= seg[n].mr_len; ++n; - p = 1; - while (len > 0) { - if (n == nsegs) - return 0; - seg[n].mr_page = xdrbuf->pages[p]; - seg[n].mr_offset = NULL; - seg[n].mr_len = min_t(u32, PAGE_SIZE, len); - len -= seg[n].mr_len; - ++n; - ++p; - } + ++p; + page_base = 0; /* page offset only applies to first page */ } + /* Message overflows the seg array */ + if (len && n == nsegs) + return 0; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; if (n == nsegs) + /* Tail remains, but we're out of segments */ return 0; seg[n].mr_page = NULL; seg[n].mr_offset = xdrbuf->tail[0].iov_base; @@ -296,6 +296,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) int copy_len; unsigned char *srcp, *destp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + int page_base; + struct page **ppages; destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; @@ -324,28 +326,25 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, destp + copy_len, curlen); rqst->rq_svec[0].iov_len += curlen; } - r_xprt->rx_stats.pullup_copy_count += copy_len; - npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; + + page_base = rqst->rq_snd_buf.page_base; + ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { - if (i == 0) - curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base; - else - curlen = PAGE_SIZE; + curlen = PAGE_SIZE - page_base; if (curlen > copy_len) curlen = copy_len; dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", __func__, i, destp, copy_len, curlen); - srcp = kmap_atomic(rqst->rq_snd_buf.pages[i], - KM_SKB_SUNRPC_DATA); - if (i == 0) - memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen); - else - memcpy(destp, srcp, curlen); + srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp, srcp+page_base, curlen); kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); rqst->rq_svec[0].iov_len += curlen; destp += curlen; copy_len -= curlen; + page_base = 0; } /* header now contains entire send message */ return pad; @@ -606,6 +605,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; + struct page **ppages; + int page_base; curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) { /* write chunk header fixup */ @@ -624,32 +625,29 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) olen = copy_len; i = 0; rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; + page_base = rqst->rq_rcv_buf.page_base; + ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + if (copy_len && rqst->rq_rcv_buf.page_len) { - npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base + + npages = PAGE_ALIGN(page_base + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; for (; i < npages; i++) { - if (i == 0) - curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base; - else - curlen = PAGE_SIZE; + curlen = PAGE_SIZE - page_base; if (curlen > copy_len) curlen = copy_len; dprintk("RPC: %s: page %d" " srcp 0x%p len %d curlen %d\n", __func__, i, srcp, copy_len, curlen); - destp = kmap_atomic(rqst->rq_rcv_buf.pages[i], - KM_SKB_SUNRPC_DATA); - if (i == 0) - memcpy(destp + rqst->rq_rcv_buf.page_base, - srcp, curlen); - else - memcpy(destp, srcp, curlen); - flush_dcache_page(rqst->rq_rcv_buf.pages[i]); + destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); srcp += curlen; copy_len -= curlen; if (copy_len == 0) break; + page_base = 0; } rqst->rq_rcv_buf.page_len = olen - copy_len; } else -- cgit v1.2.3 From 5c635e09cec0feeeb310968e51dad01040244851 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 9 Feb 2011 19:45:34 +0000 Subject: RPCRDMA: Fix FRMR registration/invalidate handling. When the rpc_memreg_strategy is 5, FRMR are used to map RPC data. This mode uses an FRMR to map the RPC data, then invalidates (i.e. unregisers) the data in xprt_rdma_free. These FRMR are used across connections on the same mount, i.e. if the connection goes away on an idle timeout and reconnects later, the FRMR are not destroyed and recreated. This creates a problem for transport errors because the WR that invalidate an FRMR may be flushed (i.e. fail) leaving the FRMR valid. When the FRMR is later used to map an RPC it will fail, tearing down the transport and starting over. Over time, more and more of the FRMR pool end up in the wrong state resulting in seemingly random disconnects. This fix keeps track of the FRMR state explicitly by setting it's state based on the successful completion of a reg/inv WR. If the FRMR is ever used and found to be in the wrong state, an invalidate WR is prepended, re-syncing the FRMR state and avoiding the connection loss. Signed-off-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 52 ++++++++++++++++++++++++++++++++++------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 45 insertions(+), 8 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5f4c7b3bc71..570f08dc0b0 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -144,6 +144,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) static inline void rpcrdma_event_process(struct ib_wc *wc) { + struct rpcrdma_mw *frmr; struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long) wc->wr_id; @@ -154,15 +155,23 @@ void rpcrdma_event_process(struct ib_wc *wc) return; if (IB_WC_SUCCESS != wc->status) { - dprintk("RPC: %s: %s WC status %X, connection lost\n", - __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send", - wc->status); + dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", + __func__, wc->opcode, wc->status); rep->rr_len = ~0U; - rpcrdma_schedule_tasklet(rep); + if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) + rpcrdma_schedule_tasklet(rep); return; } switch (wc->opcode) { + case IB_WC_FAST_REG_MR: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_VALID; + break; + case IB_WC_LOCAL_INV: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_INVALID; + break; case IB_WC_RECV: rep->rr_len = wc->byte_len; ib_dma_sync_single_for_cpu( @@ -1450,6 +1459,11 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) seg->mr_dma = ib_dma_map_single(ia->ri_id->device, seg->mr_offset, seg->mr_dmalen, seg->mr_dir); + if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { + dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", + __func__, + seg->mr_dma, seg->mr_offset, seg->mr_dmalen); + } } static void @@ -1469,7 +1483,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr frmr_wr, *bad_wr; + struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; + u8 key; int len, pageoff; int i, rc; @@ -1484,6 +1499,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; len += seg->mr_len; + BUG_ON(seg->mr_len > PAGE_SIZE); ++seg; ++i; /* Check for holes */ @@ -1494,26 +1510,45 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, dprintk("RPC: %s: Using frmr %p to map %d segments\n", __func__, seg1->mr_chunk.rl_mw, i); + if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { + dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", + __func__, + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); + /* Invalidate before using. */ + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.next = &frmr_wr; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = IB_SEND_SIGNALED; + invalidate_wr.ex.invalidate_rkey = + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + post_wr = &invalidate_wr; + } else + post_wr = &frmr_wr; + /* Bump the key */ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.send_flags = IB_SEND_SIGNALED; frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; frmr_wr.wr.fast_reg.page_list_len = i; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + BUG_ON(frmr_wr.wr.fast_reg.length < len); frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," @@ -1542,8 +1577,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_unmap_one(ia, seg++); memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c7a7eba991b..cae761a8536 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -164,6 +164,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ struct { struct ib_fast_reg_page_list *fr_pgl; struct ib_mr *fr_mr; + enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; } frmr; } r; struct list_head mw_list; -- cgit v1.2.3 From f8628220bb395104697be9c447c1085846dfc97c Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Thu, 3 Mar 2011 00:51:41 +0000 Subject: gss:krb5 only include enctype numbers in gm_upcall_enctypes Make the value in gm_upcall_enctypes just the enctype values. This allows the values to be used more easily elsewhere. Signed-off-by: Kevin Coffman Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 45dbf1521b9..f3914d0c507 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -417,7 +417,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { - len = sprintf(p, mech->gm_upcall_enctypes); + len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes); p += len; gss_msg->msg.len += len; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index f375decc024..9022f0a6503 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -750,7 +750,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + .gm_upcall_enctypes = "18,17,16,23,3,1,2", }; static int __init init_kerberos_module(void) -- cgit v1.2.3 From 4d4a76f3309edc671918a767b336492fbc80a16d Mon Sep 17 00:00:00 2001 From: "j223yang@asset.uwaterloo.ca" Date: Thu, 10 Mar 2011 12:40:28 -0500 Subject: xprt: remove redundant null check 'req' is dereferenced before checked for NULL. The patch simply removes the check. Signed-off-by: Jinqiu Yang Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 856274d7e85..8bdcdbe07b9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -202,10 +202,9 @@ int xprt_reserve_xprt(struct rpc_task *task) goto out_sleep; } xprt->snd_task = task; - if (req) { - req->rq_bytes_sent = 0; - req->rq_ntrans++; - } + req->rq_bytes_sent = 0; + req->rq_ntrans++; + return 1; out_sleep: -- cgit v1.2.3 From 986d4abbddf9e76184f6cabf66654ea8e61bcde5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Mar 2011 17:11:59 -0700 Subject: sunrpc: fix printk format warning Fix printk format build warning: net/sunrpc/xprtrdma/verbs.c:1463: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'dma_addr_t' Signed-off-by: Randy Dunlap Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 570f08dc0b0..d4297dc43dc 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1462,7 +1462,8 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", __func__, - seg->mr_dma, seg->mr_offset, seg->mr_dmalen); + (unsigned long long)seg->mr_dma, + seg->mr_offset, seg->mr_dmalen); } } -- cgit v1.2.3 From e020c6800c9621a77223bf2c1ff68180e41e8ebf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2011 19:56:30 -0400 Subject: SUNRPC: Ensure we always run the tk_callback before tk_action This fixes a race in which the task->tk_callback() puts the rpc_task to sleep, setting a new callback. Under certain circumstances, the current code may end up executing the task->tk_action before it gets round to the callback. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- net/sunrpc/sched.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5681c6a12d2..2e9387b2384 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -630,14 +630,12 @@ static void __rpc_execute(struct rpc_task *task) save_callback = task->tk_callback; task->tk_callback = NULL; save_callback(task); - } - - /* - * Perform the next FSM step. - * tk_action may be NULL when the task has been killed - * by someone else. - */ - if (!RPC_IS_QUEUED(task)) { + } else { + /* + * Perform the next FSM step. + * tk_action may be NULL when the task has been killed + * by someone else. + */ if (task->tk_action == NULL) break; task->tk_action(task); -- cgit v1.2.3 From a8de240a9074b72b156d9e6d53f00076e6cd5f03 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2011 19:56:30 -0400 Subject: SUNRPC: Convert struct rpc_xprt to use atomic_t counters Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net/sunrpc') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bef0f535f74..a0f998c07c6 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -146,7 +145,7 @@ enum xprt_transports { }; struct rpc_xprt { - struct kref kref; /* Reference count */ + atomic_t count; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ const struct rpc_timeout *timeout; /* timeout parms */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8bdcdbe07b9..4499b5a5176 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -964,7 +964,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req) xprt = kzalloc(size, GFP_KERNEL); if (xprt == NULL) goto out; - kref_init(&xprt->kref); + atomic_set(&xprt->count, 1); xprt->max_reqs = max_req; xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL); @@ -1144,13 +1144,11 @@ found: /** * xprt_destroy - destroy an RPC transport, killing off all requests. - * @kref: kref for the transport to destroy + * @xprt: transport to destroy * */ -static void xprt_destroy(struct kref *kref) +static void xprt_destroy(struct rpc_xprt *xprt) { - struct rpc_xprt *xprt = container_of(kref, struct rpc_xprt, kref); - dprintk("RPC: destroying transport %p\n", xprt); xprt->shutdown = 1; del_timer_sync(&xprt->timer); @@ -1174,7 +1172,8 @@ static void xprt_destroy(struct kref *kref) */ void xprt_put(struct rpc_xprt *xprt) { - kref_put(&xprt->kref, xprt_destroy); + if (atomic_dec_and_test(&xprt->count)) + xprt_destroy(xprt); } /** @@ -1184,6 +1183,7 @@ void xprt_put(struct rpc_xprt *xprt) */ struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) { - kref_get(&xprt->kref); - return xprt; + if (atomic_inc_not_zero(&xprt->count)) + return xprt; + return NULL; } -- cgit v1.2.3 From ba3c578de274a5438bafbce03f9225936698051c Mon Sep 17 00:00:00 2001 From: "j223yang@asset.uwaterloo.ca" Date: Wed, 16 Mar 2011 11:16:22 -0400 Subject: xprt: remove redundant check remove redundant check. Signed-off-by: Jinqiu Yang Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4499b5a5176..9494c376735 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -212,7 +212,7 @@ out_sleep: task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (req && req->rq_ntrans) + if (req->rq_ntrans) rpc_sleep_on(&xprt->resend, task, NULL); else rpc_sleep_on(&xprt->sending, task, NULL); -- cgit v1.2.3 From 8e26de238fd794c8ea56a5c98bf67c40cfeb051d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 17 Mar 2011 18:54:23 +0300 Subject: RPC: killing RPC tasks races fixed RPC task RPC_TASK_QUEUED bit is set must be checked before trying to wake up task rpc_killall_tasks() because task->tk_waitqueue can not be set (equal to NULL). Also, as Trond Myklebust mentioned, such approach (instead of checking tk_waitqueue to NULL) allows us to "optimise away the call to rpc_wake_up_queued_task() altogether for those tasks that aren't queued". Here is an example of dereferencing of tk_waitqueue equal to NULL: CPU 0 CPU 1 CPU 2 -------------------- --------------------- -------------------------- nfs4_run_open_task rpc_run_task rpc_execute rpc_set_active rpc_make_runnable (waiting) rpc_async_schedule nfs4_open_prepare nfs_wait_on_sequence nfs_umount_begin rpc_killall_tasks rpc_wake_up_task rpc_wake_up_queued_task spin_lock(tk_waitqueue == NULL) BUG() rpc_sleep_on spin_lock(&q->lock) __rpc_sleep_on task->tk_waitqueue = q Signed-off-by: Stanislav Kinsbursky Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index edaf56e2ed2..e7a96e478f6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -436,7 +436,9 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) if (!(rovr->tk_flags & RPC_TASK_KILLED)) { rovr->tk_flags |= RPC_TASK_KILLED; rpc_exit(rovr, -EIO); - rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); + if (RPC_IS_QUEUED(rovr)) + rpc_wake_up_queued_task(rovr->tk_waitqueue, + rovr); } } spin_unlock(&clnt->cl_lock); -- cgit v1.2.3