From 127b21b89f9d8ba0dc23e47b8c35d8a0bac9d6fc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Jan 2015 13:19:17 -0500 Subject: SUNRPC: Adjust rpciod workqueue parameters Increase the concurrency level for rpciod threads to allow for allocations etc that happen in the RPCSEC_GSS layer. Also note that the NFSv4 byte range locks may now need to allocate memory from inside rpciod. Add the WQ_HIGHPRI flag to improve latency guarantees while we're at it. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index d20f2329eea3..4f65ec28d2b4 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1069,7 +1069,8 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1); + /* Note: highpri because network receive is latency sensitive */ + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } -- cgit v1.2.3 From c4a7ca774949960064dac11b326908f28407e8c3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Jan 2015 14:50:56 -0500 Subject: SUNRPC: Allow waiting on memory allocation We should be safe now, as long as we don't do GFP_IO or higher allocations Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 4f65ec28d2b4..b91fd9c597b4 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -844,10 +844,10 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; + gfp_t gfp = GFP_NOIO | __GFP_NOWARN; if (RPC_IS_SWAPPER(task)) - gfp |= __GFP_MEMALLOC; + gfp = __GFP_MEMALLOC | 
GFP_NOWAIT | __GFP_NOWARN; size += sizeof(struct rpc_buffer); if (size <= RPC_BUFFER_MAXSIZE) -- cgit v1.2.3 From 8502427ccd9500cefc1ad47655371f9121934845 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:04 -0500 Subject: xprtrdma: human-readable completion status Make it easier to grep the system log for specific error conditions. The wc.opcode field is not included because opcode numbers are sparse, and because wc.opcode is not necessarily valid when completion reports an error. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 70 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c98e40643910..56f705d63d5c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -173,18 +173,54 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) } } +static const char * const wc_status[] = { + "success", + "local length error", + "local QP operation error", + "local EE context operation error", + "local protection error", + "WR flushed", + "memory management operation error", + "bad response error", + "local access error", + "remote invalid request error", + "remote access error", + "remote operation error", + "transport retry counter exceeded", + "RNR retrycounter exceeded", + "local RDD violation error", + "remove invalid RD request", + "operation aborted", + "invalid EE context number", + "invalid EE context state", + "fatal error", + "response timeout error", + "general error", +}; + +#define COMPLETION_MSG(status) \ + ((status) < ARRAY_SIZE(wc_status) ? 
\ + wc_status[(status)] : "unexpected completion error") + static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + if (likely(wc->status == IB_WC_SUCCESS)) + return; - dprintk("RPC: %s: frmr %p status %X opcode %d\n", - __func__, frmr, wc->status, wc->opcode); + /* WARNING: Only wr_id and status are reliable at this point */ + if (wc->wr_id == 0ULL) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("RPC: %s: SEND: %s\n", + __func__, COMPLETION_MSG(wc->status)); + } else { + struct rpcrdma_mw *r; - if (wc->wr_id == 0ULL) - return; - if (wc->status != IB_WC_SUCCESS) - frmr->r.frmr.fr_state = FRMR_IS_STALE; + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + r->r.frmr.fr_state = FRMR_IS_STALE; + pr_err("RPC: %s: frmr %p (stale): %s\n", + __func__, r, COMPLETION_MSG(wc->status)); + } } static int @@ -248,16 +284,17 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; - dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", - __func__, rep, wc->status, wc->opcode, wc->byte_len); + /* WARNING: Only wr_id and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) + goto out_fail; - if (wc->status != IB_WC_SUCCESS) { - rep->rr_len = ~0U; - goto out_schedule; - } + /* status == SUCCESS means all fields in wc are trustworthy */ if (wc->opcode != IB_WC_RECV) return; + dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", + __func__, rep, wc->byte_len); + rep->rr_len = wc->byte_len; ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); @@ -275,6 +312,13 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) out_schedule: list_add_tail(&rep->rr_list, sched_list); + return; +out_fail: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("RPC: %s: rep %p: %s\n", + __func__, rep, 
COMPLETION_MSG(wc->status)); + rep->rr_len = ~0U; + goto out_schedule; } static int -- cgit v1.2.3 From 284f4902a632584e8d73cf7d9363f819adf7240c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:13 -0500 Subject: xprtrdma: Modernize htonl and ntohl Clean up: Replace htonl and ntohl with the be32 equivalents. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 48 +++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index df01d124936c..a6fb30b0a8cc 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -209,9 +209,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (cur_rchunk) { /* read */ cur_rchunk->rc_discrim = xdr_one; /* all read chunks have the same "position" */ - cur_rchunk->rc_position = htonl(pos); - cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); - cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); + cur_rchunk->rc_position = cpu_to_be32(pos); + cur_rchunk->rc_target.rs_handle = + cpu_to_be32(seg->mr_rkey); + cur_rchunk->rc_target.rs_length = + cpu_to_be32(seg->mr_len); xdr_encode_hyper( (__be32 *)&cur_rchunk->rc_target.rs_offset, seg->mr_base); @@ -222,8 +224,10 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, cur_rchunk++; r_xprt->rx_stats.read_chunk_count++; } else { /* write/reply */ - cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); - cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); + cur_wchunk->wc_target.rs_handle = + cpu_to_be32(seg->mr_rkey); + cur_wchunk->wc_target.rs_length = + cpu_to_be32(seg->mr_len); xdr_encode_hyper( (__be32 *)&cur_wchunk->wc_target.rs_offset, seg->mr_base); @@ -257,7 +261,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, *iptr++ = xdr_zero; /* encode a NULL reply chunk */ } else { 
warray->wc_discrim = xdr_one; - warray->wc_nchunks = htonl(nchunks); + warray->wc_nchunks = cpu_to_be32(nchunks); iptr = (__be32 *) cur_wchunk; if (type == rpcrdma_writech) { *iptr++ = xdr_zero; /* finish the write chunk list */ @@ -404,11 +408,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* build RDMA header in private area at front */ headerp = (struct rpcrdma_msg *) req->rl_base; - /* don't htonl XID, it's already done in request */ + /* don't byte-swap XID, it's already done in request */ headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = xdr_one; - headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = htonl(RDMA_MSG); + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + headerp->rm_type = rdma_msg; /* * Chunks needed for results? @@ -482,11 +486,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) RPCRDMA_INLINE_PAD_VALUE(rqst)); if (padlen) { - headerp->rm_type = htonl(RDMA_MSGP); + headerp->rm_type = rdma_msgp; headerp->rm_body.rm_padded.rm_align = - htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); + cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); headerp->rm_body.rm_padded.rm_thresh = - htonl(RPCRDMA_INLINE_PAD_THRESH); + cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; @@ -570,7 +574,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; - i = ntohl(**iptrp); /* get array count */ + i = be32_to_cpu(**iptrp); if (i > max) return -1; cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); @@ -582,11 +586,11 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", __func__, - ntohl(seg->rs_length), + be32_to_cpu(seg->rs_length), 
(unsigned long long)off, - ntohl(seg->rs_handle)); + be32_to_cpu(seg->rs_handle)); } - total_len += ntohl(seg->rs_length); + total_len += be32_to_cpu(seg->rs_length); ++cur_wchunk; } /* check and adjust for properly terminated write chunk */ @@ -749,9 +753,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) goto repost; } headerp = (struct rpcrdma_msg *) rep->rr_base; - if (headerp->rm_vers != xdr_one) { + if (headerp->rm_vers != rpcrdma_version) { dprintk("RPC: %s: invalid version %d\n", - __func__, ntohl(headerp->rm_vers)); + __func__, be32_to_cpu(headerp->rm_vers)); goto repost; } @@ -793,7 +797,7 @@ repost: /* check for expected message types */ /* The order of some of these tests is important. */ switch (headerp->rm_type) { - case htonl(RDMA_MSG): + case rdma_msg: /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ /* never expect write chunks without having offered RDMA */ @@ -832,7 +836,7 @@ repost: rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); break; - case htonl(RDMA_NOMSG): + case rdma_nomsg: /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || @@ -853,7 +857,7 @@ badheader: dprintk("%s: invalid rpcrdma reply header (type %d):" " chunks[012] == %d %d %d" " expected chunks <= %d\n", - __func__, ntohl(headerp->rm_type), + __func__, be32_to_cpu(headerp->rm_type), headerp->rm_body.rm_chunks[0], headerp->rm_body.rm_chunks[1], headerp->rm_body.rm_chunks[2], -- cgit v1.2.3 From 052151a9798ef7a79372fdc688018dc405a6063c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:21 -0500 Subject: xprtrdma: Display XIDs in host byte order xprtsock.c and the backchannel code display XIDs in host byte order. Follow suit in xprtrdma. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a6fb30b0a8cc..150dd7641803 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -766,7 +766,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) spin_unlock(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p failed " "to match any request xid 0x%08x len %d\n", - __func__, rep, headerp->rm_xid, rep->rr_len); + __func__, rep, be32_to_cpu(headerp->rm_xid), + rep->rr_len); repost: r_xprt->rx_stats.bad_reply_count++; rep->rr_func = rpcrdma_reply_handler; @@ -782,13 +783,14 @@ repost: spin_unlock(&xprt->transport_lock); dprintk("RPC: %s: duplicate reply 0x%p to RPC " "request 0x%p: xid 0x%08x\n", __func__, rep, req, - headerp->rm_xid); + be32_to_cpu(headerp->rm_xid)); goto repost; } dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", - __func__, rep, req, rqst, headerp->rm_xid); + __func__, rep, req, rqst, + be32_to_cpu(headerp->rm_xid)); /* from here on, the reply is no longer an orphan */ req->rl_reply = rep; -- cgit v1.2.3 From f2846481b4bf758cf7c3fe8f24b35950306f1db2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:29 -0500 Subject: xprtrdma: Clean up hdrlen Clean up: Replace naked integers with a documenting macro. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 150dd7641803..dcf5ebc3d373 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -472,7 +472,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return -EIO; } - hdrlen = 28; /*sizeof *headerp;*/ + hdrlen = RPCRDMA_HDRLEN_MIN; padlen = 0; /* @@ -748,7 +748,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) } return; } - if (rep->rr_len < 28) { + if (rep->rr_len < RPCRDMA_HDRLEN_MIN) { dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; } @@ -830,8 +830,9 @@ repost: } else { /* else ordinary inline */ rdmalen = 0; - iptr = (__be32 *)((unsigned char *)headerp + 28); - rep->rr_len -= 28; /*sizeof *headerp;*/ + iptr = (__be32 *)((unsigned char *)headerp + + RPCRDMA_HDRLEN_MIN); + rep->rr_len -= RPCRDMA_HDRLEN_MIN; status = rep->rr_len; } /* Fix up the rpc results for upper layer */ @@ -845,7 +846,8 @@ repost: headerp->rm_body.rm_chunks[2] != xdr_one || req->rl_nchunks == 0) goto badheader; - iptr = (__be32 *)((unsigned char *)headerp + 28); + iptr = (__be32 *)((unsigned char *)headerp + + RPCRDMA_HDRLEN_MIN); rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); if (rdmalen < 0) goto badheader; -- cgit v1.2.3 From 5abefb861fd4306467813380cf21ce21d4b274ce Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:37 -0500 Subject: xprtrdma: Rename "xprt" and "rdma_connect" fields in struct rpcrdma_xprt Clean up: Use consistent field names in struct rpcrdma_xprt. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 19 ++++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index bbd6155d3e34..ee5751326339 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -200,9 +200,9 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) static void xprt_rdma_connect_worker(struct work_struct *work) { - struct rpcrdma_xprt *r_xprt = - container_of(work, struct rpcrdma_xprt, rdma_connect.work); - struct rpc_xprt *xprt = &r_xprt->xprt; + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, + rx_connect_worker.work); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc = 0; xprt_clear_connected(xprt); @@ -235,7 +235,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) dprintk("RPC: %s: called\n", __func__); - cancel_delayed_work_sync(&r_xprt->rdma_connect); + cancel_delayed_work_sync(&r_xprt->rx_connect_worker); xprt_clear_connected(xprt); @@ -374,7 +374,8 @@ xprt_setup_rdma(struct xprt_create *args) * connection loss notification is async. We also catch connection loss * when reaping receives. 
*/ - INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); + INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, + xprt_rdma_connect_worker); new_ep->rep_func = rpcrdma_conn_func; new_ep->rep_xprt = xprt; @@ -434,17 +435,17 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) if (r_xprt->rx_ep.rep_connected != 0) { /* Reconnect */ - schedule_delayed_work(&r_xprt->rdma_connect, - xprt->reestablish_timeout); + schedule_delayed_work(&r_xprt->rx_connect_worker, + xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; } else { - schedule_delayed_work(&r_xprt->rdma_connect, 0); + schedule_delayed_work(&r_xprt->rx_connect_worker, 0); if (!RPC_IS_ASYNC(task)) - flush_delayed_work(&r_xprt->rdma_connect); + flush_delayed_work(&r_xprt->rx_connect_worker); } } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b799041b75bf..9a7aab31bf6e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -318,16 +318,16 @@ struct rpcrdma_stats { * during unmount. */ struct rpcrdma_xprt { - struct rpc_xprt xprt; + struct rpc_xprt rx_xprt; struct rpcrdma_ia rx_ia; struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; struct rpcrdma_create_data_internal rx_data; - struct delayed_work rdma_connect; + struct delayed_work rx_connect_worker; struct rpcrdma_stats rx_stats; }; -#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt) +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) /* Setting this to 0 ensures interoperability with early servers. 
-- cgit v1.2.3 From 5d410ba061c1e4bc0068ce91f2cf349998cde46c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:46 -0500 Subject: xprtrdma: Remove rpcrdma_ep::rep_ia Clean up: This field is not used. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 1 - net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 56f705d63d5c..56e14b369d42 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -825,7 +825,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, else if (ep->rep_cqinit <= 2) ep->rep_cqinit = 0; INIT_CQCOUNT(ep); - ep->rep_ia = ia; init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9a7aab31bf6e..5160a84fdb72 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -83,7 +83,6 @@ struct rpcrdma_ep { atomic_t rep_cqcount; int rep_cqinit; int rep_connected; - struct rpcrdma_ia *rep_ia; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; struct ib_sge rep_pad; /* holds zeroed pad */ -- cgit v1.2.3 From 3eb358106660195948f4e95822039c5799fc41f8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:02:54 -0500 Subject: xprtrdma: Remove rl_mr field, and the mr_chunk union Clean up: Since commit 0ac531c18323 ("xprtrdma: Remove REGISTER memory registration mode"), the rl_mr pointer is no longer used anywhere. After removal, there's only a single member of the mr_chunk union, so mr_chunk can be removed as well, in favor of a single pointer field. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 25 ++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 +---- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 56e14b369d42..1000f637edee 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1493,8 +1493,8 @@ rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) int i; for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) - rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); - rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg->rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->rl_mw, buf); } static void @@ -1580,7 +1580,7 @@ rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, list_add(&r->mw_list, stale); continue; } - req->rl_segments[i].mr_chunk.rl_mw = r; + req->rl_segments[i].rl_mw = r; if (unlikely(i-- == 0)) return req; /* Success */ } @@ -1602,7 +1602,7 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) r = list_entry(buf->rb_mws.next, struct rpcrdma_mw, mw_list); list_del(&r->mw_list); - req->rl_segments[i].mr_chunk.rl_mw = r; + req->rl_segments[i].rl_mw = r; if (unlikely(i-- == 0)) return req; /* Success */ } @@ -1842,7 +1842,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; struct ib_mr *mr = frmr->fr_mr; struct ib_send_wr fastreg_wr, *bad_wr; @@ -1931,12 +1931,12 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; - seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, 
sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); read_lock(&ia->ri_qplock); @@ -1946,7 +1946,7 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, read_unlock(&ia->ri_qplock); if (rc) { /* Force rpcrdma_buffer_get() to retry */ - seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); } @@ -1978,8 +1978,7 @@ rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, - physaddrs, i, seg1->mr_dma); + rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); if (rc) { dprintk("RPC: %s: failed ib_map_phys_fmr " "%u@0x%llx+%i (%d)... 
status %i\n", __func__, @@ -1988,7 +1987,7 @@ rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, while (i--) rpcrdma_unmap_one(ia, --seg); } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; + seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; @@ -2005,7 +2004,7 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, LIST_HEAD(l); int rc; - list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); + list_add(&seg1->rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); read_lock(&ia->ri_qplock); while (seg1->mr_nsegs--) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5160a84fdb72..532d58667b9d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -210,10 +210,7 @@ struct rpcrdma_mw { */ struct rpcrdma_mr_seg { /* chunk descriptors */ - union { /* chunk memory handles */ - struct ib_mr *rl_mr; /* if registered directly */ - struct rpcrdma_mw *rl_mw; /* if registered from region */ - } mr_chunk; + struct rpcrdma_mw *rl_mw; /* registered MR */ u64 mr_base; /* registration result */ u32 mr_rkey; /* registration result */ u32 mr_len; /* length of chunk or segment */ -- cgit v1.2.3 From eba8ff660b2d8b7fcd6669fcab2c025b59f66d26 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:02 -0500 Subject: xprtrdma: Move credit update to RPC reply handler Reduce work in the receive CQ handler, which can be run at hardware interrupt level, by moving the RPC/RDMA credit update logic to the RPC reply handler. This has some additional benefits: More header sanity checking is done before trusting the incoming credit value, and the receive CQ handler no longer touches the RPC/RDMA header (the CPU stalls while waiting for the header contents to be brought into the cache). This further extends work begun by commit e7ce710a8802 ("xprtrdma: Avoid deadlock when credit window is reset"). 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 10 ++++++++-- net/sunrpc/xprtrdma/verbs.c | 15 ++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index dcf5ebc3d373..d7310109b601 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -736,7 +736,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) struct rpc_xprt *xprt = rep->rr_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); __be32 *iptr; - int rdmalen, status; + int credits, rdmalen, status; unsigned long cwnd; /* Check status. If bad, signal disconnect and return rep to pool */ @@ -871,8 +871,14 @@ badheader: break; } + credits = be32_to_cpu(headerp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_max_requests) + credits = r_xprt->rx_buf.rb_max_requests; + cwnd = xprt->cwnd; - xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; + xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1000f637edee..71a071aaf0ab 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -49,6 +49,7 @@ #include #include +#include #include #include "xprt_rdma.h" @@ -298,17 +299,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) rep->rr_len = wc->byte_len; ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); - - if (rep->rr_len >= 16) { - struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; - unsigned int credits = ntohl(p->rm_credit); - - if (credits == 0) - credits = 1; /* don't deadlock */ - else if (credits > rep->rr_buffer->rb_max_requests) - credits = 
rep->rr_buffer->rb_max_requests; - atomic_set(&rep->rr_buffer->rb_credits, credits); - } + prefetch(rep->rr_base); out_schedule: list_add_tail(&rep->rr_list, sched_list); @@ -480,7 +471,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); ep->rep_connected = connstate; @@ -1186,7 +1176,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); - atomic_set(&buf->rb_credits, 1); /* Need to allocate: * 1. arrays for send and recv pointers diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 532d58667b9d..3fcc92b0e3ca 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -248,7 +248,6 @@ struct rpcrdma_req { */ struct rpcrdma_buffer { spinlock_t rb_lock; /* protects indexes */ - atomic_t rb_credits; /* most recent server credits */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ struct list_head rb_all; -- cgit v1.2.3 From afadc468eb309b7c48ffdc8fa4c72acbb9991613 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:11 -0500 Subject: xprtrdma: Remove rpcrdma_ep::rep_func and ::rep_xprt Clean up: The rep_func field always refers to rpcrdma_conn_func(). rep_func should have been removed by commit b45ccfd25d50 ("xprtrdma: Remove MEMWINDOWS registration modes"). 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 +++- net/sunrpc/xprtrdma/transport.c | 2 -- net/sunrpc/xprtrdma/verbs.c | 6 +++--- net/sunrpc/xprtrdma/xprt_rdma.h | 2 -- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index d7310109b601..f2eda155299a 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -695,7 +695,9 @@ rpcrdma_connect_worker(struct work_struct *work) { struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpc_xprt *xprt = ep->rep_xprt; + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; spin_lock_bh(&xprt->transport_lock); if (++xprt->connect_cookie == 0) /* maintain a reserved value */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ee5751326339..a487bde71b4a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -376,8 +376,6 @@ xprt_setup_rdma(struct xprt_create *args) */ INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - new_ep->rep_func = rpcrdma_conn_func; - new_ep->rep_xprt = xprt; xprt_rdma_format_addresses(xprt); xprt->max_payload = rpcrdma_max_payload(new_xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 71a071aaf0ab..c61bb61c4d13 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -154,7 +154,7 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) event->device->name, context); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - ep->rep_func(ep); + rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); } } @@ -169,7 +169,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) event->device->name, context); if (ep->rep_connected == 
1) { ep->rep_connected = -EIO; - ep->rep_func(ep); + rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); } } @@ -474,7 +474,7 @@ connected: dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); ep->rep_connected = connstate; - ep->rep_func(ep); + rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 3fcc92b0e3ca..657c370e48b9 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -87,8 +87,6 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct ib_sge rep_pad; /* holds zeroed pad */ struct ib_mr *rep_pad_mr; /* holds zeroed pad */ - void (*rep_func)(struct rpcrdma_ep *); - struct rpc_xprt *rep_xprt; /* for rep_func */ struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; -- cgit v1.2.3 From 5ae711a24601257f395c1f8746ac95be0cbd75e5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:19 -0500 Subject: xprtrdma: Free the pd if ib_query_qp() fails If ib_query_device() fails (the subject says ib_query_qp(), but the call whose failure path this patch changes is ib_query_device()) or the memory registration mode isn't supported, don't leak the PD. An orphaned IB/core resource will cause IB module removal to hang.
Fixes: bd7ed1d13304 ("RPC/RDMA: check selected memory registration ...") Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c61bb61c4d13..aa012a393448 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -614,7 +614,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (rc) { dprintk("RPC: %s: ib_query_device failed %d\n", __func__, rc); - goto out2; + goto out3; } if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { @@ -672,14 +672,14 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) "phys register failed with %lX\n", __func__, PTR_ERR(ia->ri_bind_mem)); rc = -ENOMEM; - goto out2; + goto out3; } break; default: printk(KERN_ERR "RPC: Unsupported memory " "registration mode: %d\n", memreg); rc = -ENOMEM; - goto out2; + goto out3; } dprintk("RPC: %s: memory registration strategy is %d\n", __func__, memreg); @@ -689,6 +689,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rwlock_init(&ia->ri_qplock); return 0; + +out3: + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; out2: rdma_destroy_id(ia->ri_id); ia->ri_id = NULL; -- cgit v1.2.3 From 7bc7972cdd1f137552ca979caa11c8acbe119ae8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:27 -0500 Subject: xprtrdma: Take struct ib_device_attr off the stack Device attributes are large, and are used in more than one place. Stash a copy in dynamically allocated memory. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 37 +++++++++++++------------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 14 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index aa012a393448..123bb04dd823 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -588,8 +588,8 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { int rc, mem_priv; - struct ib_device_attr devattr; struct rpcrdma_ia *ia = &xprt->rx_ia; + struct ib_device_attr *devattr = &ia->ri_devattr; ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { @@ -605,26 +605,21 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - /* - * Query the device to determine if the requested memory - * registration strategy is supported. If it isn't, set the - * strategy to a globally supported model. 
- */ - rc = ib_query_device(ia->ri_id->device, &devattr); + rc = ib_query_device(ia->ri_id->device, devattr); if (rc) { dprintk("RPC: %s: ib_query_device failed %d\n", __func__, rc); goto out3; } - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { + if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { ia->ri_have_dma_lkey = 1; ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; } if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr.device_cap_flags & + if ((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { dprintk("RPC: %s: FRMR registration " @@ -634,7 +629,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) /* Mind the ia limit on FRMR page list depth */ ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr.max_fast_reg_page_list_len); + devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -736,20 +731,13 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr devattr; + struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; int rc, err; - rc = ib_query_device(ia->ri_id->device, &devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - return rc; - } - /* check provider's send/recv wr limits */ - if (cdata->max_requests > devattr.max_qp_wr) - cdata->max_requests = devattr.max_qp_wr; + if (cdata->max_requests > devattr->max_qp_wr) + cdata->max_requests = devattr->max_qp_wr; ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; @@ -784,8 +772,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { - cdata->max_requests = devattr.max_qp_wr / depth; 
+ if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * @@ -868,10 +856,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else - ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; + ep->rep_remote_cma.responder_resources = + devattr->max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 657c370e48b9..ec596cebc966 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -70,6 +70,7 @@ struct rpcrdma_ia { int ri_async_rc; enum rpcrdma_memreg ri_memreg_strategy; unsigned int ri_max_frmr_depth; + struct ib_device_attr ri_devattr; }; /* -- cgit v1.2.3 From ce1ab9ab47973dcff7548abda20e49add2c4ca95 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:35 -0500 Subject: xprtrdma: Take struct ib_qp_attr and ib_qp_init_attr off the stack Reduce stack footprint of the connection upcall handler function. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 15 ++++++++------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 123bb04dd823..958b372cb919 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -425,8 +425,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; #endif - struct ib_qp_attr attr; - struct ib_qp_init_attr iattr; + struct ib_qp_attr *attr = &ia->ri_qp_attr; + struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; int connstate = 0; switch (event->event) { @@ -449,12 +449,13 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; - ib_query_qp(ia->ri_id->qp, &attr, - IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, - &iattr); + ib_query_qp(ia->ri_id->qp, attr, + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, + iattr); dprintk("RPC: %s: %d responder resources" " (%d initiator)\n", - __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); + __func__, attr->max_dest_rd_atomic, + attr->max_rd_atomic); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: connstate = -ENOTCONN; @@ -487,7 +488,7 @@ connected: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (connstate == 1) { - int ird = attr.max_dest_rd_atomic; + int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; printk(KERN_INFO "rpcrdma: connection to %pI4:%u " "on %s, memreg %d slots %d ird %d%s\n", diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ec596cebc966..2b4e7787734d 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -71,6 +71,8 @@ struct rpcrdma_ia { enum rpcrdma_memreg 
ri_memreg_strategy; unsigned int ri_max_frmr_depth; struct ib_device_attr ri_devattr; + struct ib_qp_attr ri_qp_attr; + struct ib_qp_init_attr ri_qp_init_attr; }; /* -- cgit v1.2.3 From ac920d04a7f307bfd7633f60abe33fb626f6ec83 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:44 -0500 Subject: xprtrdma: Simplify synopsis of rpcrdma_buffer_create() Clean up: There is one call site for rpcrdma_buffer_create(). All of the arguments there are fields of an rpcrdma_xprt. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 3 +-- net/sunrpc/xprtrdma/verbs.c | 7 +++++-- net/sunrpc/xprtrdma/xprt_rdma.h | 4 +--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a487bde71b4a..808b3c52427a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -364,8 +364,7 @@ xprt_setup_rdma(struct xprt_create *args) * any inline data. Also specify any padding which will be provided * from a preregistered zero buffer. 
*/ - rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, - &new_xprt->rx_data); + rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out3; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 958b372cb919..fd71501403fd 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1161,9 +1161,11 @@ out_free: } int -rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, - struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) +rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; char *p; size_t len, rlen, wlen; int i, rc; @@ -1200,6 +1202,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * Register the zeroed pad buffer, if any. */ if (cdata->padding) { + struct rpcrdma_ep *ep = &r_xprt->rx_ep; rc = rpcrdma_register_internal(ia, p, cdata->padding, &ep->rep_pad_mr, &ep->rep_pad); if (rc) diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2b4e7787734d..5c2fac3f30b6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -354,9 +354,7 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ -int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *, - struct rpcrdma_ia *, - struct rpcrdma_create_data_internal *); +int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); -- cgit v1.2.3 From 1392402c405a75de1cdc658d36c6007ea1c037de Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:03:52 -0500 Subject: xprtrdma: Refactor rpcrdma_buffer_create() and rpcrdma_buffer_destroy() Move the details of how to create and destroy rpcrdma_req and rpcrdma_rep 
structures into helper functions. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 148 ++++++++++++++++++++++++++++---------------- 1 file changed, 95 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index fd71501403fd..24ea6dd184e4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1075,6 +1075,69 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) } } +static struct rpcrdma_req * +rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t wlen = 1 << fls(cdata->inline_wsize + + sizeof(struct rpcrdma_req)); + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_req *req; + int rc; + + rc = -ENOMEM; + req = kmalloc(wlen, GFP_KERNEL); + if (req == NULL) + goto out; + memset(req, 0, sizeof(struct rpcrdma_req)); + + rc = rpcrdma_register_internal(ia, req->rl_base, wlen - + offsetof(struct rpcrdma_req, rl_base), + &req->rl_handle, &req->rl_iov); + if (rc) + goto out_free; + + req->rl_size = wlen - sizeof(struct rpcrdma_req); + req->rl_buffer = &r_xprt->rx_buf; + return req; + +out_free: + kfree(req); +out: + return ERR_PTR(rc); +} + +static struct rpcrdma_rep * +rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t rlen = 1 << fls(cdata->inline_rsize + + sizeof(struct rpcrdma_rep)); + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_rep *rep; + int rc; + + rc = -ENOMEM; + rep = kmalloc(rlen, GFP_KERNEL); + if (rep == NULL) + goto out; + memset(rep, 0, sizeof(struct rpcrdma_rep)); + + rc = rpcrdma_register_internal(ia, rep->rr_base, rlen - + offsetof(struct rpcrdma_rep, rr_base), + &rep->rr_handle, &rep->rr_iov); + if (rc) + goto out_free; + + rep->rr_buffer = &r_xprt->rx_buf; + return rep; + +out_free: + kfree(rep); +out: + return 
ERR_PTR(rc); +} + static int rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) { @@ -1167,7 +1230,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; char *p; - size_t len, rlen, wlen; + size_t len; int i, rc; buf->rb_max_requests = cdata->max_requests; @@ -1227,68 +1290,55 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) break; } - /* - * Allocate/init the request/reply buffers. Doing this - * using kmalloc for now -- one for each buf. - */ - wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); - rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); - dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", - __func__, wlen, rlen); - for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; struct rpcrdma_rep *rep; - req = kmalloc(wlen, GFP_KERNEL); - if (req == NULL) { + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) { dprintk("RPC: %s: request buffer %d alloc" " failed\n", __func__, i); - rc = -ENOMEM; + rc = PTR_ERR(req); goto out; } - memset(req, 0, sizeof(struct rpcrdma_req)); buf->rb_send_bufs[i] = req; - buf->rb_send_bufs[i]->rl_buffer = buf; - - rc = rpcrdma_register_internal(ia, req->rl_base, - wlen - offsetof(struct rpcrdma_req, rl_base), - &buf->rb_send_bufs[i]->rl_handle, - &buf->rb_send_bufs[i]->rl_iov); - if (rc) - goto out; - buf->rb_send_bufs[i]->rl_size = wlen - - sizeof(struct rpcrdma_req); - - rep = kmalloc(rlen, GFP_KERNEL); - if (rep == NULL) { + rep = rpcrdma_create_rep(r_xprt); + if (IS_ERR(rep)) { dprintk("RPC: %s: reply buffer %d alloc failed\n", __func__, i); - rc = -ENOMEM; + rc = PTR_ERR(rep); goto out; } - memset(rep, 0, sizeof(struct rpcrdma_rep)); buf->rb_recv_bufs[i] = rep; - buf->rb_recv_bufs[i]->rr_buffer = buf; - - rc = rpcrdma_register_internal(ia, rep->rr_base, - rlen - offsetof(struct rpcrdma_rep, rr_base), - &buf->rb_recv_bufs[i]->rr_handle, - 
&buf->rb_recv_bufs[i]->rr_iov); - if (rc) - goto out; - } - dprintk("RPC: %s: max_requests %d\n", - __func__, buf->rb_max_requests); - /* done */ + return 0; out: rpcrdma_buffer_destroy(buf); return rc; } +static void +rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) +{ + if (!rep) + return; + + rpcrdma_deregister_internal(ia, rep->rr_handle, &rep->rr_iov); + kfree(rep); +} + +static void +rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + if (!req) + return; + + rpcrdma_deregister_internal(ia, req->rl_handle, &req->rl_iov); + kfree(req); +} + static void rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) { @@ -1344,18 +1394,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) dprintk("RPC: %s: entering\n", __func__); for (i = 0; i < buf->rb_max_requests; i++) { - if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { - rpcrdma_deregister_internal(ia, - buf->rb_recv_bufs[i]->rr_handle, - &buf->rb_recv_bufs[i]->rr_iov); - kfree(buf->rb_recv_bufs[i]); - } - if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { - rpcrdma_deregister_internal(ia, - buf->rb_send_bufs[i]->rl_handle, - &buf->rb_send_bufs[i]->rl_iov); - kfree(buf->rb_send_bufs[i]); - } + if (buf->rb_recv_bufs) + rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]); + if (buf->rb_send_bufs) + rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } switch (ia->ri_memreg_strategy) { -- cgit v1.2.3 From 9128c3e794a77917a86dd5490ca2c5233a8c6fde Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:00 -0500 Subject: xprtrdma: Add struct rpcrdma_regbuf and helpers There are several spots that allocate a buffer via kmalloc (usually contiguously with another data structure) and then register that buffer internally. I'd like to split the buffers out of these data structures to allow the data structures to scale. Start by adding functions that can kmalloc and register a buffer, and can manage/preserve the buffer's associated ib_sge and ib_mr fields. 
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 55 +++++++++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 24ea6dd184e4..cdd6aacc9168 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1828,6 +1828,61 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia, return rc; } +/** + * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers + * @ia: controlling rpcrdma_ia + * @size: size of buffer to be allocated, in bytes + * @flags: GFP flags + * + * Returns pointer to private header of an area of internally + * registered memory, or an ERR_PTR. The registered buffer follows + * the end of the private header. + * + * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for + * receiving the payload of RDMA RECV operations. regbufs are not + * used for RDMA READ/WRITE operations, thus are registered only for + * LOCAL access. + */ +struct rpcrdma_regbuf * +rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + int rc; + + rc = -ENOMEM; + rb = kmalloc(sizeof(*rb) + size, flags); + if (rb == NULL) + goto out; + + rb->rg_size = size; + rb->rg_owner = NULL; + rc = rpcrdma_register_internal(ia, rb->rg_base, size, + &rb->rg_mr, &rb->rg_iov); + if (rc) + goto out_free; + + return rb; + +out_free: + kfree(rb); +out: + return ERR_PTR(rc); +} + +/** + * rpcrdma_free_regbuf - deregister and free registered buffer + * @ia: controlling rpcrdma_ia + * @rb: regbuf to be deregistered and freed + */ +void +rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (rb) { + rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); + kfree(rb); + } +} + /* * Wrappers for chunk registration, shared by read/write chunk code. 
*/ diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5c2fac3f30b6..36c37c60f1fe 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -106,6 +106,44 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV + * + * The below structure appears at the front of a large region of kmalloc'd + * memory, which always starts on a good alignment boundary. + */ + +struct rpcrdma_regbuf { + size_t rg_size; + struct rpcrdma_req *rg_owner; + struct ib_mr *rg_mr; + struct ib_sge rg_iov; + __be32 rg_base[0] __attribute__ ((aligned(256))); +}; + +static inline u64 +rdmab_addr(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.addr; +} + +static inline u32 +rdmab_length(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.length; +} + +static inline u32 +rdmab_lkey(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.lkey; +} + +static inline struct rpcrdma_msg * +rdmab_to_msg(struct rpcrdma_regbuf *rb) +{ + return (struct rpcrdma_msg *)rb->rg_base; +} + enum rpcrdma_chunktype { rpcrdma_noch = 0, rpcrdma_readch, @@ -372,6 +410,11 @@ int rpcrdma_register_external(struct rpcrdma_mr_seg *, int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, struct rpcrdma_xprt *); +struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, + size_t, gfp_t); +void rpcrdma_free_regbuf(struct rpcrdma_ia *, + struct rpcrdma_regbuf *); + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ -- cgit v1.2.3 From 0ca77dc372110cbed4dbac5e867ffdc60ebccf6a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:08 -0500 Subject: xprtrdma: Allocate RPC send buffer separately from struct rpcrdma_req Because internal memory registration is an expensive and synchronous operation, xprtrdma pre-registers send and receive buffers at mount time, 
and then re-uses them for each RPC. A "hardway" allocation is a memory allocation and registration that replaces a send buffer during the processing of an RPC. Hardway must be done if the RPC send buffer is too small to accommodate an RPC's call and reply headers. For xprtrdma, each RPC send buffer is currently part of struct rpcrdma_req so that xprt_rdma_free(), which is passed nothing but the address of an RPC send buffer, can find its matching struct rpcrdma_req and rpcrdma_rep quickly via container_of / offsetof. That means that hardway currently has to replace a whole rpcrdma_req when it replaces an RPC send buffer. This is often a fairly hefty chunk of contiguous memory due to the size of the rl_segments array and the fact that both the send and receive buffers are part of struct rpcrdma_req. Some obscure re-use of fields in rpcrdma_req is done so that xprt_rdma_free() can detect replaced rpcrdma_req structs, and restore the original. This commit breaks apart the RPC send buffer and struct rpcrdma_req so that increasing the size of the rl_segments array does not change the alignment of each RPC send buffer. (Increasing rl_segments is needed to bump up the maximum r/wsize for NFS/RDMA). This change opens up some interesting possibilities for improving the design of xprt_rdma_allocate(). xprt_rdma_allocate() is now the one place where RPC send buffers are allocated or re-allocated, and they are now always left in place by xprt_rdma_free(). A large re-allocation that includes both the rl_segments array and the RPC send buffer is no longer needed. Send buffer re-allocation becomes quite rare. Good send buffer alignment is guaranteed no matter what the size of the rl_segments array is.
Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 6 +- net/sunrpc/xprtrdma/transport.c | 146 ++++++++++++++++------------------------ net/sunrpc/xprtrdma/verbs.c | 16 ++--- net/sunrpc/xprtrdma/xprt_rdma.h | 14 ++-- 4 files changed, 78 insertions(+), 104 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f2eda155299a..8a6bdbd3e936 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -541,9 +541,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = req->rl_iov.lkey; - req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; - req->rl_send_iov[1].lkey = req->rl_iov.lkey; + req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; @@ -556,7 +556,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; req->rl_send_iov[3].length = rqst->rq_slen - rpclen; - req->rl_send_iov[3].lkey = req->rl_iov.lkey; + req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 4; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 808b3c52427a..a9d566227e7e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -449,77 +449,72 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* * The RDMA allocate/free functions need the task structure as a place * to hide the struct rpcrdma_req, which is necessary for the actual send/recv - * sequence. For this reason, the recv buffers are attached to send - * buffers for portions of the RPC. Note that the RPC layer allocates - * both send and receive buffers in the same call. We may register - * the receive buffer portion when using reply chunks. + * sequence. 
+ * + * The RPC layer allocates both send and receive buffers in the same call + * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer). + * We may register rq_rcv_buf when using reply chunks. */ static void * xprt_rdma_allocate(struct rpc_task *task, size_t size) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - struct rpcrdma_req *req, *nreq; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + size_t min_size; + gfp_t flags = task->tk_flags & RPC_TASK_SWAPPER ? + GFP_ATOMIC : GFP_NOFS; - req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); + req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) return NULL; - if (size > req->rl_size) { - dprintk("RPC: %s: size %zd too large for buffer[%zd]: " - "prog %d vers %d proc %d\n", - __func__, size, req->rl_size, - task->tk_client->cl_prog, task->tk_client->cl_vers, - task->tk_msg.rpc_proc->p_proc); - /* - * Outgoing length shortage. Our inline write max must have - * been configured to perform direct i/o. - * - * This is therefore a large metadata operation, and the - * allocate call was made on the maximum possible message, - * e.g. containing long filename(s) or symlink data. In - * fact, while these metadata operations *might* carry - * large outgoing payloads, they rarely *do*. However, we - * have to commit to the request here, so reallocate and - * register it now. The data path will never require this - * reallocation. - * - * If the allocation or registration fails, the RPC framework - * will (doggedly) retry. 
- */ - if (task->tk_flags & RPC_TASK_SWAPPER) - nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); - else - nreq = kmalloc(sizeof *req + size, GFP_NOFS); - if (nreq == NULL) - goto outfail; - - if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, - nreq->rl_base, size + sizeof(struct rpcrdma_req) - - offsetof(struct rpcrdma_req, rl_base), - &nreq->rl_handle, &nreq->rl_iov)) { - kfree(nreq); - goto outfail; - } - rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; - nreq->rl_size = size; - nreq->rl_niovs = 0; - nreq->rl_nchunks = 0; - nreq->rl_buffer = (struct rpcrdma_buffer *)req; - nreq->rl_reply = req->rl_reply; - memcpy(nreq->rl_segments, - req->rl_segments, sizeof nreq->rl_segments); - /* flag the swap with an unused field */ - nreq->rl_iov.length = 0; - req->rl_reply = NULL; - req = nreq; - } + if (req->rl_sendbuf == NULL) + goto out_sendbuf; + if (size > req->rl_sendbuf->rg_size) + goto out_sendbuf; + +out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ - return req->rl_xdr_buf; - -outfail: + return req->rl_sendbuf->rg_base; + +out_sendbuf: + /* XDR encoding and RPC/RDMA marshaling of this request has not + * yet occurred. Thus a lower bound is needed to prevent buffer + * overrun during marshaling. + * + * RPC/RDMA marshaling may choose to send payload bearing ops + * inline, if the result is smaller than the inline threshold. + * The value of the "size" argument accounts for header + * requirements but not for the payload in these cases. + * + * Likewise, allocate enough space to receive a reply up to the + * size of the inline threshold. + * + * It's unlikely that both the send header and the received + * reply will be large, but slush is provided here to allow + * flexibility when marshaling. 
+ */ + min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp); + min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); + if (size < min_size) + size = min_size; + + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags); + if (IS_ERR(rb)) + goto out_fail; + rb->rg_owner = req; + + r_xprt->rx_stats.hardway_register_count += size; + rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf); + req->rl_sendbuf = rb; + goto out; + +out_fail: rpcrdma_buffer_put(req); - rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; + r_xprt->rx_stats.failed_marshal_count++; return NULL; } @@ -531,47 +526,24 @@ xprt_rdma_free(void *buffer) { struct rpcrdma_req *req; struct rpcrdma_xprt *r_xprt; - struct rpcrdma_rep *rep; + struct rpcrdma_regbuf *rb; int i; if (buffer == NULL) return; - req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); - if (req->rl_iov.length == 0) { /* see allocate above */ - r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer, - struct rpcrdma_xprt, rx_buf); - } else - r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - rep = req->rl_reply; + rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); + req = rb->rg_owner; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); - dprintk("RPC: %s: called on 0x%p%s\n", - __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); + dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - /* - * Finish the deregistration. The process is considered - * complete when the rr_func vector becomes NULL - this - * was put in place during rpcrdma_reply_handler() - the wait - * call below will not block if the dereg is "done". If - * interrupted, our framework will clean up. 
- */ for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; i += rpcrdma_deregister_external( &req->rl_segments[i], r_xprt); } - if (req->rl_iov.length == 0) { /* see allocate above */ - struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; - oreq->rl_reply = req->rl_reply; - (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, - req->rl_handle, - &req->rl_iov); - kfree(req); - req = oreq; - } - - /* Put back request+reply buffers */ rpcrdma_buffer_put(req); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index cdd6aacc9168..40894403db81 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1079,25 +1079,22 @@ static struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - size_t wlen = 1 << fls(cdata->inline_wsize + - sizeof(struct rpcrdma_req)); + size_t wlen = cdata->inline_wsize; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_req *req; int rc; rc = -ENOMEM; - req = kmalloc(wlen, GFP_KERNEL); + req = kmalloc(sizeof(*req) + wlen, GFP_KERNEL); if (req == NULL) goto out; - memset(req, 0, sizeof(struct rpcrdma_req)); + memset(req, 0, sizeof(*req)); - rc = rpcrdma_register_internal(ia, req->rl_base, wlen - - offsetof(struct rpcrdma_req, rl_base), + rc = rpcrdma_register_internal(ia, req->rl_base, wlen, &req->rl_handle, &req->rl_iov); if (rc) goto out_free; - req->rl_size = wlen - sizeof(struct rpcrdma_req); req->rl_buffer = &r_xprt->rx_buf; return req; @@ -1121,7 +1118,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep = kmalloc(rlen, GFP_KERNEL); if (rep == NULL) goto out; - memset(rep, 0, sizeof(struct rpcrdma_rep)); + memset(rep, 0, sizeof(*rep)); rc = rpcrdma_register_internal(ia, rep->rr_base, rlen - offsetof(struct rpcrdma_rep, rr_base), @@ -1335,6 +1332,7 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) if (!req) return; + rpcrdma_free_regbuf(ia, req->rl_sendbuf); 
rpcrdma_deregister_internal(ia, req->rl_handle, &req->rl_iov); kfree(req); } @@ -1729,8 +1727,6 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) struct rpcrdma_buffer *buffers = req->rl_buffer; unsigned long flags; - if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ - buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_recv_index < buffers->rb_max_requests) { req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 36c37c60f1fe..aa82f8d1c5b4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -262,7 +262,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ }; struct rpcrdma_req { - size_t rl_size; /* actual length of buffer */ unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ @@ -271,13 +270,20 @@ struct rpcrdma_req { struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ struct ib_sge rl_send_iov[4]; /* for active requests */ + struct rpcrdma_regbuf *rl_sendbuf; struct ib_sge rl_iov; /* for posting */ struct ib_mr *rl_handle; /* handle for mem in rl_iov */ char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ - __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ }; -#define rpcr_to_rdmar(r) \ - container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) + +static inline struct rpcrdma_req * +rpcr_to_rdmar(struct rpc_rqst *rqst) +{ + struct rpcrdma_regbuf *rb = container_of(rqst->rq_buffer, + struct rpcrdma_regbuf, + rg_base[0]); + return rb->rg_owner; +} /* * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for -- cgit v1.2.3 From 85275c874eaeb92fb2a78a1d4ebb1ff4b0f7b732 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:16 -0500 
Subject: xprtrdma: Allocate RPC/RDMA send buffer separately from struct rpcrdma_req The rl_base field is currently the buffer where each RPC/RDMA call header is built. The inline threshold is an agreed-on size limit for RDMA SEND operations that pass between client and server. The sum of the RPC/RDMA header size and the RPC header size must be less than or equal to this threshold. Increasing the r/wsize maximum will require MAX_SEGS to grow significantly, but the inline threshold size won't change (both sides agree on it). The server's inline threshold doesn't change. Since an RPC/RDMA header can never be larger than the inline threshold, make all RPC/RDMA header buffers the size of the inline threshold. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 11 +++++------ net/sunrpc/xprtrdma/transport.c | 9 +++++++++ net/sunrpc/xprtrdma/verbs.c | 22 +++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 6 ++---- 4 files changed, 19 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8a6bdbd3e936..c1d4a093b8f1 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -294,7 +294,7 @@ ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); if (req->rl_rtype != rpcrdma_noch) result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, @@ -406,8 +406,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) base = rqst->rq_svec[0].iov_base; rpclen = rqst->rq_svec[0].iov_len; - /* build RDMA header in private area at front */ - headerp = (struct rpcrdma_msg *) req->rl_base; + headerp = rdmab_to_msg(req->rl_rdmabuf); /* don't byte-swap XID, it's already done in request */ headerp->rm_xid = rqst->rq_xid; headerp->rm_vers =
rpcrdma_version; @@ -528,7 +527,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, - headerp, base, req->rl_iov.lkey); + headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* * initialize send_iov's - normally only two: rdma chunk header and @@ -537,9 +536,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * header and any write data. In all non-rdma cases, any following * data has been copied into the RPC header buffer. */ - req->rl_send_iov[0].addr = req->rl_iov.addr; + req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = hdrlen; - req->rl_send_iov[0].lkey = req->rl_iov.lkey; + req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a9d566227e7e..2c2fabe99d84 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -470,6 +470,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; + if (req->rl_rdmabuf == NULL) + goto out_rdmabuf; if (req->rl_sendbuf == NULL) goto out_sendbuf; if (size > req->rl_sendbuf->rg_size) @@ -480,6 +482,13 @@ out: req->rl_connect_cookie = 0; /* our reserved value */ return req->rl_sendbuf->rg_base; +out_rdmabuf: + min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp); + rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags); + if (IS_ERR(rb)) + goto out_fail; + req->rl_rdmabuf = rb; + out_sendbuf: /* XDR encoding and RPC/RDMA marshaling of this request has not * yet occurred. 
Thus a lower bound is needed to prevent buffer diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 40894403db81..c81749b9a0de 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1078,30 +1078,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) static struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - size_t wlen = cdata->inline_wsize; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_req *req; - int rc; - rc = -ENOMEM; - req = kmalloc(sizeof(*req) + wlen, GFP_KERNEL); + req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) - goto out; - memset(req, 0, sizeof(*req)); - - rc = rpcrdma_register_internal(ia, req->rl_base, wlen, - &req->rl_handle, &req->rl_iov); - if (rc) - goto out_free; + return ERR_PTR(-ENOMEM); req->rl_buffer = &r_xprt->rx_buf; return req; - -out_free: - kfree(req); -out: - return ERR_PTR(rc); } static struct rpcrdma_rep * @@ -1333,7 +1317,7 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) return; rpcrdma_free_regbuf(ia, req->rl_sendbuf); - rpcrdma_deregister_internal(ia, req->rl_handle, &req->rl_iov); + rpcrdma_free_regbuf(ia, req->rl_rdmabuf); kfree(req); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index aa82f8d1c5b4..84ad863fe637 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -268,12 +268,10 @@ struct rpcrdma_req { enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ struct ib_sge rl_send_iov[4]; /* for active requests */ + struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; - struct ib_sge rl_iov; /* for posting */ - struct ib_mr *rl_handle; /* handle for mem in rl_iov */ - 
char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * -- cgit v1.2.3 From 6b1184cd4fb086a826f658b02d9d9912dd0dde08 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:25 -0500 Subject: xprtrdma: Allocate RPC/RDMA receive buffer separately from struct rpcrdma_rep The rr_base field is currently the buffer where RPC replies land. An RPC/RDMA reply header lands in this buffer. In some cases an RPC reply header also lands in this buffer, just after the RPC/RDMA header. The inline threshold is an agreed-on size limit for RDMA SEND operations that pass from server to client. The sum of the RPC/RDMA reply header size and the RPC reply header size must be less than this threshold. The largest RDMA RECV that the client should have to handle is the size of the inline threshold. The receive buffer should thus be the size of the inline threshold, and not related to RPCRDMA_MAX_SEGS. RPC replies received via RDMA WRITE (long replies) are caught in rq_rcv_buf, which is the second half of the RPC send buffer. I.e., such replies are not involved in any way with rr_base.
Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 5 +++-- net/sunrpc/xprtrdma/verbs.c | 27 ++++++++++++++------------- net/sunrpc/xprtrdma/xprt_rdma.h | 14 ++++++-------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c1d4a093b8f1..02efcaa1bbac 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -572,6 +572,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b { unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; + char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); i = be32_to_cpu(**iptrp); if (i > max) @@ -599,7 +600,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b return -1; cur_wchunk = (struct rpcrdma_write_chunk *) w; } - if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) + if ((char *)cur_wchunk > base + rep->rr_len) return -1; *iptrp = (__be32 *) cur_wchunk; @@ -753,7 +754,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; } - headerp = (struct rpcrdma_msg *) rep->rr_base; + headerp = rdmab_to_msg(rep->rr_rdmabuf); if (headerp->rm_vers != rpcrdma_version) { dprintk("RPC: %s: invalid version %d\n", __func__, be32_to_cpu(headerp->rm_vers)); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c81749b9a0de..f58521dd88e2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -298,8 +298,9 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) rep->rr_len = wc->byte_len; ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, - rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); - prefetch(rep->rr_base); + rdmab_addr(rep->rr_rdmabuf), + rep->rr_len, DMA_FROM_DEVICE); + prefetch(rdmab_to_msg(rep->rr_rdmabuf)); out_schedule: list_add_tail(&rep->rr_list, 
sched_list); @@ -1092,23 +1093,21 @@ static struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - size_t rlen = 1 << fls(cdata->inline_rsize + - sizeof(struct rpcrdma_rep)); struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_rep *rep; int rc; rc = -ENOMEM; - rep = kmalloc(rlen, GFP_KERNEL); + rep = kzalloc(sizeof(*rep), GFP_KERNEL); if (rep == NULL) goto out; - memset(rep, 0, sizeof(*rep)); - rc = rpcrdma_register_internal(ia, rep->rr_base, rlen - - offsetof(struct rpcrdma_rep, rr_base), - &rep->rr_handle, &rep->rr_iov); - if (rc) + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize, + GFP_KERNEL); + if (IS_ERR(rep->rr_rdmabuf)) { + rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; + } rep->rr_buffer = &r_xprt->rx_buf; return rep; @@ -1306,7 +1305,7 @@ rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) if (!rep) return; - rpcrdma_deregister_internal(ia, rep->rr_handle, &rep->rr_iov); + rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); kfree(rep); } @@ -2209,11 +2208,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, recv_wr.next = NULL; recv_wr.wr_id = (u64) (unsigned long) rep; - recv_wr.sg_list = &rep->rr_iov; + recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; recv_wr.num_sge = 1; ib_dma_sync_single_for_cpu(ia->ri_id->device, - rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); + rdmab_addr(rep->rr_rdmabuf), + rdmab_length(rep->rr_rdmabuf), + DMA_BIDIRECTIONAL); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 84ad863fe637..2b69316dfd11 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -180,14 +180,12 @@ enum rpcrdma_chunktype { struct rpcrdma_buffer; struct rpcrdma_rep { - unsigned int rr_len; /* actual received reply length */ - struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ - struct rpc_xprt *rr_xprt; /* 
needed for request/reply matching */ - void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ - struct list_head rr_list; /* tasklet list */ - struct ib_sge rr_iov; /* for posting */ - struct ib_mr *rr_handle; /* handle for mem in rr_iov */ - char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ + unsigned int rr_len; + struct rpcrdma_buffer *rr_buffer; + struct rpc_xprt *rr_xprt; + void (*rr_func)(struct rpcrdma_rep *); + struct list_head rr_list; + struct rpcrdma_regbuf *rr_rdmabuf; }; /* -- cgit v1.2.3 From c05fbb5a593571961fdb4ba06a2bff49aed9dcee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:33 -0500 Subject: xprtrdma: Allocate zero pad separately from rpcrdma_buffer Use the new rpcrdma_alloc_regbuf() API to shrink the amount of contiguous memory needed for a buffer pool by moving the zero pad buffer into a regbuf. This is for consistency with the other uses of internally registered memory. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 29 ++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 3 +-- 3 files changed, 13 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 02efcaa1bbac..7e9acd9361c5 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -549,9 +549,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (padlen) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - req->rl_send_iov[2].addr = ep->rep_pad.addr; + req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); req->rl_send_iov[2].length = padlen; - req->rl_send_iov[2].lkey = ep->rep_pad.lkey; + req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; req->rl_send_iov[3].length = rqst->rq_slen - rpclen; diff --git a/net/sunrpc/xprtrdma/verbs.c 
b/net/sunrpc/xprtrdma/verbs.c index f58521dd88e2..8a05f45d1a11 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -794,6 +794,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; + if (cdata->padding) { + ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, + GFP_KERNEL); + if (IS_ERR(ep->rep_padbuf)) + return PTR_ERR(ep->rep_padbuf); + } else + ep->rep_padbuf = NULL; + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, @@ -876,6 +884,7 @@ out2: dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); out1: + rpcrdma_free_regbuf(ia, ep->rep_padbuf); return rc; } @@ -902,11 +911,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } - /* padding - could be done in rpcrdma_buffer_destroy... */ - if (ep->rep_pad_mr) { - rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); - ep->rep_pad_mr = NULL; - } + rpcrdma_free_regbuf(ia, ep->rep_padbuf); rpcrdma_clean_cq(ep->rep_attr.recv_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq); @@ -1220,12 +1225,10 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) * 1. arrays for send and recv pointers * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies - * 4. padding, if any * Send/recv buffers in req/rep need to be registered */ len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); - len += cdata->padding; p = kzalloc(len, GFP_KERNEL); if (p == NULL) { @@ -1241,18 +1244,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - /* - * Register the zeroed pad buffer, if any. 
- */ - if (cdata->padding) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - rc = rpcrdma_register_internal(ia, p, cdata->padding, - &ep->rep_pad_mr, &ep->rep_pad); - if (rc) - goto out; - } - p += cdata->padding; - INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); switch (ia->ri_memreg_strategy) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2b69316dfd11..5630353ed240 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -88,8 +88,7 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; - struct ib_sge rep_pad; /* holds zeroed pad */ - struct ib_mr *rep_pad_mr; /* holds zeroed pad */ + struct rpcrdma_regbuf *rep_padbuf; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; -- cgit v1.2.3 From df515ca7b3b47bf6fd489fe6fca0d9ab243e1985 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 21 Jan 2015 11:04:41 -0500 Subject: xprtrdma: Clean up after adding regbuf management rpcrdma_{de}register_internal() are used only in verbs.c now. MAX_RPCRDMAHDR is no longer used and can be removed. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 9 --------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8a05f45d1a11..124676c13780 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1729,7 +1729,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. 
*/ -int +static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) { @@ -1780,7 +1780,7 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, return rc; } -int +static int rpcrdma_deregister_internal(struct rpcrdma_ia *ia, struct ib_mr *mr, struct ib_sge *iov) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5630353ed240..c9d2a02f631b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -171,10 +171,6 @@ enum rpcrdma_chunktype { /* temporary static scatter/gather max */ #define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ -#define MAX_RPCRDMAHDR (\ - /* max supported RPC/RDMA header */ \ - sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ - (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) struct rpcrdma_buffer; @@ -401,11 +397,6 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int, - struct ib_mr **, struct ib_sge *); -int rpcrdma_deregister_internal(struct rpcrdma_ia *, - struct ib_mr *, struct ib_sge *); - int rpcrdma_register_external(struct rpcrdma_mr_seg *, int, int, struct rpcrdma_xprt *); int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, -- cgit v1.2.3 From a0a1d50cd1e80652142af5cddcde500d06c71bdd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 26 Jan 2015 17:11:47 -0500 Subject: xprtrdma: Update the GFP flags used in xprt_rdma_allocate() Reflect the more conservative approach used in the socket transport's version of this transport method. An RPC buffer allocation should avoid forcing not just FS activity, but any I/O. 
In particular, two recent changes missed updating xprtrdma: - Commit c6c8fe79a83e ("net, sunrpc: suppress allocation warning ...") - Commit a564b8f03986 ("nfs: enable swap on NFS") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2c2fabe99d84..2e192baa59f3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -463,13 +463,16 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; size_t min_size; - gfp_t flags = task->tk_flags & RPC_TASK_SWAPPER ? - GFP_ATOMIC : GFP_NOFS; + gfp_t flags; req = rpcrdma_buffer_get(&r_xprt->rx_buf); if (req == NULL) return NULL; + flags = GFP_NOIO | __GFP_NOWARN; + if (RPC_IS_SWAPPER(task)) + flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; + if (req->rl_rdmabuf == NULL) goto out_rdmabuf; if (req->rl_sendbuf == NULL) -- cgit v1.2.3 From 840210fc4872bcbc17ab4f435f28021dce9d0aff Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 24 Jun 2014 10:59:52 -0400 Subject: sunrpc: add rpc_count_iostats_idx Add a call to tally stats for a task under a different statsidx than what's contained in the task structure. This is needed to properly account for pnfs reads/writes when the DS nfs version != the MDS version. 
Signed-off-by: Weston Andros Adamson Signed-off-by: Tom Haynes --- net/sunrpc/stats.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 9711a155bc50..2ecb994314c1 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -140,22 +140,20 @@ void rpc_free_iostats(struct rpc_iostats *stats) EXPORT_SYMBOL_GPL(rpc_free_iostats); /** - * rpc_count_iostats - tally up per-task stats + * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task - * @stats: array of stat structures + * @op_metrics: stat structure for OP that will accumulate stats from @task */ -void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) +void rpc_count_iostats_metrics(const struct rpc_task *task, + struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_iostats *op_metrics; ktime_t delta, now; - if (!stats || !req) + if (!op_metrics || !req) return; now = ktime_get(); - op_metrics = &stats[task->tk_msg.rpc_proc->p_statidx]; - spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; @@ -175,6 +173,20 @@ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) spin_unlock(&op_metrics->om_lock); } +EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); + +/** + * rpc_count_iostats - tally up per-task stats + * @task: completed rpc_task + * @stats: array of stat structures + * + * Uses the statidx from @task + */ +void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) +{ + rpc_count_iostats_metrics(task, + &stats[task->tk_msg.rpc_proc->p_statidx]); +} EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, -- cgit v1.2.3 From 03a9a42a1a7e5b3e7919ddfacc1d1cc81882a955 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 30 Jan 2015 18:12:28 -0500 Subject: SUNRPC: NULL utsname dereference on NFS umount during namespace cleanup 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix an Oopsable condition when nsm_mon_unmon is called as part of the namespace cleanup, which now apparently happens after the utsname has been freed. Link: http://lkml.kernel.org/r/20150125220604.090121ae@neptune.home Reported-by: Bruno Prémont Cc: stable@vger.kernel.org # 3.18 Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 12 +++++++----- net/sunrpc/rpcb_clnt.c | 8 ++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 05da12a33945..3f5d4d48f0cb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -286,10 +286,8 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { - clnt->cl_nodelen = strlen(nodename); - if (clnt->cl_nodelen > UNX_MAXNODENAME) - clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, nodename, clnt->cl_nodelen); + clnt->cl_nodelen = strlcpy(clnt->cl_nodename, + nodename, sizeof(clnt->cl_nodename)); } static int rpc_client_register(struct rpc_clnt *clnt, @@ -365,6 +363,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, const struct rpc_version *version; struct rpc_clnt *clnt = NULL; const struct rpc_timeout *timeout; + const char *nodename = args->nodename; int err; /* sanity check the name before trying to print it */ @@ -420,8 +419,10 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, atomic_set(&clnt->cl_count, 1); + if (nodename == NULL) + nodename = utsname()->nodename; /* save the nodename */ - rpc_clnt_set_nodename(clnt, utsname()->nodename); + rpc_clnt_set_nodename(clnt, nodename); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) @@ -576,6 +577,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, if (xprt == NULL) goto out_err; 
args->servername = xprt->servername; + args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xprt, clnt); if (IS_ERR(new)) { diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 05202012bcfc..cf5770d8f49a 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -355,7 +355,8 @@ out: return result; } -static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, +static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, + const char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version) { @@ -365,6 +366,7 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *hostname, .address = srvaddr, .addrsize = salen, .servername = hostname, + .nodename = nodename, .program = &rpcb_program, .version = version, .authflavor = RPC_AUTH_UNIX, @@ -740,7 +742,9 @@ void rpcb_getport_async(struct rpc_task *task) dprintk("RPC: %5u %s: trying rpcbind version %u\n", task->tk_pid, __func__, bind_version); - rpcb_clnt = rpcb_create(xprt->xprt_net, xprt->servername, sap, salen, + rpcb_clnt = rpcb_create(xprt->xprt_net, + clnt->cl_nodename, + xprt->servername, sap, salen, xprt->prot, bind_version); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); -- cgit v1.2.3 From b625a61698619c7af652de2701a2fb17c5c5d66e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 4 Feb 2015 16:59:32 -0500 Subject: xprtrdma: Address sparse complaint in rpcr_to_rdmar() With "make ARCH=x86_64 allmodconfig make C=1 CF=-D__CHECK_ENDIAN__": linux-2.6/net/sunrpc/xprtrdma/xprt_rdma.h:273:30: warning: incorrect type in initializer (different base types) linux-2.6/net/sunrpc/xprtrdma/xprt_rdma.h:273:30: expected restricted __be32 [usertype] *buffer linux-2.6/net/sunrpc/xprtrdma/xprt_rdma.h:273:30: got unsigned int [usertype] *rq_buffer As far as I can tell this is a false positive. 
Reported-by: kbuild-all@01.org Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c9d2a02f631b..d1b70397c60f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -270,9 +270,10 @@ struct rpcrdma_req { static inline struct rpcrdma_req * rpcr_to_rdmar(struct rpc_rqst *rqst) { - struct rpcrdma_regbuf *rb = container_of(rqst->rq_buffer, - struct rpcrdma_regbuf, - rg_base[0]); + void *buffer = rqst->rq_buffer; + struct rpcrdma_regbuf *rb; + + rb = container_of(buffer, struct rpcrdma_regbuf, rg_base); return rb->rg_owner; } -- cgit v1.2.3 From 4dda9c8a5e34773b290c6b5938ccb36e7fcdf35c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 15:00:06 -0500 Subject: SUNRPC: Set SO_REUSEPORT socket option for TCP connections When using TCP, we need the ability to reuse port numbers after a disconnection, so that the NFSv3 server knows that we're the same client. Currently we use a hack to work around the TCP socket's TIME_WAIT: we send an RST instead of closing, which doesn't always work... The SO_REUSEPORT option added in Linux 3.9 allows us to bind multiple TCP connections to the same source address+port combination, and thus to use ordinary TCP close() instead of the current hack. 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 87ce7e8bb8dc..484c5040436a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1666,6 +1666,39 @@ static unsigned short xs_get_random_port(void) return rand + xprt_min_resvport; } +/** + * xs_set_reuseaddr_port - set the socket's port and address reuse options + * @sock: socket + * + * Note that this function has to be called on all sockets that share the + * same port, and it must be called before binding. + */ +static void xs_sock_set_reuseport(struct socket *sock) +{ + char opt = 1; + + kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); +} + +static unsigned short xs_sock_getport(struct socket *sock) +{ + struct sockaddr_storage buf; + int buflen; + unsigned short port = 0; + + if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0) + goto out; + switch (buf.ss_family) { + case AF_INET6: + port = ntohs(((struct sockaddr_in6 *)&buf)->sin6_port); + break; + case AF_INET: + port = ntohs(((struct sockaddr_in *)&buf)->sin_port); + } +out: + return port; +} + /** * xs_set_port - reset the port number in the remote endpoint address * @xprt: generic transport @@ -1680,6 +1713,12 @@ static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) xs_update_peer_port(xprt); } +static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) +{ + if (transport->srcport == 0) + transport->srcport = xs_sock_getport(sock); +} + static unsigned short xs_get_srcport(struct sock_xprt *transport) { unsigned short port = transport->srcport; @@ -1833,7 +1872,8 @@ static void xs_dummy_setup_socket(struct work_struct *work) } static struct socket *xs_create_sock(struct rpc_xprt *xprt, - struct sock_xprt *transport, int family, int type, int protocol) + struct sock_xprt *transport, 
int family, int type, + int protocol, bool reuseport) { struct socket *sock; int err; @@ -1846,6 +1886,9 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, } xs_reclassify_socket(family, sock); + if (reuseport) + xs_sock_set_reuseport(sock); + err = xs_bind(transport, sock); if (err) { sock_release(sock); @@ -2047,7 +2090,8 @@ static void xs_udp_setup_socket(struct work_struct *work) /* Start by resetting any existing state */ xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, - xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP); + xs_addr(xprt)->sa_family, SOCK_DGRAM, + IPPROTO_UDP, false); if (IS_ERR(sock)) goto out; @@ -2149,7 +2193,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_allocation = GFP_ATOMIC; /* socket options */ - sk->sk_userlocks |= SOCK_BINDPORT_LOCK; sock_reset_flag(sk, SOCK_LINGER); tcp_sk(sk)->linger2 = 0; tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; @@ -2174,6 +2217,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { case 0: + xs_set_srcport(transport, sock); case -EINPROGRESS: /* SYN_SENT! */ if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) @@ -2202,7 +2246,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, - xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP); + xs_addr(xprt)->sa_family, SOCK_STREAM, + IPPROTO_TCP, true); if (IS_ERR(sock)) { status = PTR_ERR(sock); goto out; -- cgit v1.2.3 From 3913c78c3ab61500ddf7c2c9617cc4f8e2c583e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 21:44:04 -0500 Subject: SUNRPC: Handle EADDRINUSE on connect Now that we're setting SO_REUSEPORT, we still need to handle the case where a connect() is attempted, but the old socket is still lingering. 
Essentially, all we want to do here is handle the error by waiting a few seconds and then retrying. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 +++ net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3f5d4d48f0cb..612aa73bbc60 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1826,6 +1826,7 @@ call_connect_status(struct rpc_task *task) case -ECONNABORTED: case -ENETUNREACH: case -EHOSTUNREACH: + case -EADDRINUSE: case -ENOBUFS: case -EPIPE: if (RPC_IS_SOFTCONN(task)) @@ -1934,6 +1935,7 @@ call_transmit_status(struct rpc_task *task) } case -ECONNRESET: case -ECONNABORTED: + case -EADDRINUSE: case -ENOTCONN: case -ENOBUFS: case -EPIPE: @@ -2053,6 +2055,7 @@ call_status(struct rpc_task *task) case -ECONNRESET: case -ECONNABORTED: rpc_force_rebind(clnt); + case -EADDRINUSE: case -ENOBUFS: rpc_delay(task, 3*HZ); case -EPIPE: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 484c5040436a..20f25a837e06 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -721,6 +721,7 @@ static int xs_tcp_send_request(struct rpc_task *task) xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: + case -EADDRINUSE: case -EPIPE: clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); } @@ -2299,6 +2300,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -ECONNREFUSED: case -ECONNRESET: case -ENETUNREACH: + case -EADDRINUSE: case -ENOBUFS: /* retry with existing socket, after a delay */ goto out; -- cgit v1.2.3 From 76698b2358de466d23f44eaa1b0c9ebe8206099a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 16:28:58 -0500 Subject: SUNRPC: Do not clear the source port in xs_reset_transport Now that we can reuse bound ports after a close, we never really want to clear the transport's source port after it has been set. Doing so really messes up the NFSv3 DRC on the server. 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 20f25a837e06..ea1882f97912 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -811,8 +811,6 @@ static void xs_reset_transport(struct sock_xprt *transport) if (sk == NULL) return; - transport->srcport = 0; - write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; -- cgit v1.2.3 From 6cc7e908362a9dfec3c821f77ec98b6758592060 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 18:35:25 -0500 Subject: SUNRPC: Ensure xs_reset_transport() resets the close connection flags Otherwise, we may end up looping. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ea1882f97912..0fa7ed93dc20 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -803,10 +803,21 @@ static void xs_error_report(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } +static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) +{ + smp_mb__before_atomic(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_atomic(); +} + static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; struct sock *sk = transport->inet; + struct rpc_xprt *xprt = &transport->xprt; if (sk == NULL) return; @@ -819,8 +830,9 @@ static void xs_reset_transport(struct sock_xprt *transport) xs_restore_old_callbacks(transport, sk); write_unlock_bh(&sk->sk_callback_lock); + xs_sock_reset_connection_flags(xprt); - trace_rpc_socket_close(&transport->xprt, sock); + trace_rpc_socket_close(xprt, sock); sock_release(sock); } @@ 
-845,11 +857,6 @@ static void xs_close(struct rpc_xprt *xprt) xs_reset_transport(transport); xprt->reestablish_timeout = 0; - smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_atomic(); xprt_disconnect_done(xprt); } @@ -1455,16 +1462,6 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) xprt_clear_connecting(xprt); } -static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) -{ - smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_atomic(); -} - static void xs_sock_mark_closed(struct rpc_xprt *xprt) { xs_sock_reset_connection_flags(xprt); -- cgit v1.2.3 From 718ba5b87343df303017585200ee182e937eabfc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 18:19:25 -0500 Subject: SUNRPC: Add helpers to prevent socket create from racing The socket lock is currently held by the task that is requesting the connection be established. While that is efficient in the case where the connection happens quickly, it is racy in the case where it doesn't. What we really want is for the connect helper to be able to block access to the socket while it is being set up. This patch does so by arranging to transfer the socket lock from the task that is requesting the connect attempt, and then releasing that lock once everything is done. This scheme also gives us automatic protection against collisions with the RPC close code, so we can kill the cancel_delayed_work_sync() call in xs_close(). 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 37 +++++++++++++++++++++++++++++++++---- net/sunrpc/xprtsock.c | 7 +++++-- 2 files changed, 38 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ebbefad21a37..ff3574df8344 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -690,6 +690,37 @@ out_abort: spin_unlock(&xprt->transport_lock); } +bool xprt_lock_connect(struct rpc_xprt *xprt, + struct rpc_task *task, + void *cookie) +{ + bool ret = false; + + spin_lock_bh(&xprt->transport_lock); + if (!test_bit(XPRT_LOCKED, &xprt->state)) + goto out; + if (xprt->snd_task != task) + goto out; + xprt->snd_task = cookie; + ret = true; +out: + spin_unlock_bh(&xprt->transport_lock); + return ret; +} + +void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) +{ + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task != cookie) + goto out; + if (!test_bit(XPRT_LOCKED, &xprt->state)) + goto out; + xprt->snd_task =NULL; + xprt->ops->release_xprt(xprt, NULL); +out: + spin_unlock_bh(&xprt->transport_lock); +} + /** * xprt_connect - schedule a transport connect operation * @task: RPC task that is requesting the connect @@ -712,9 +743,7 @@ void xprt_connect(struct rpc_task *task) if (test_and_clear_bit(XPRT_CLOSE_WAIT, &xprt->state)) xprt->ops->close(xprt); - if (xprt_connected(xprt)) - xprt_release_write(xprt, task); - else { + if (!xprt_connected(xprt)) { task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -726,6 +755,7 @@ void xprt_connect(struct rpc_task *task) xprt->stat.connect_start = jiffies; xprt->ops->connect(xprt, task); } + xprt_release_write(xprt, task); } static void xprt_connect_status(struct rpc_task *task) @@ -758,7 +788,6 @@ static void xprt_connect_status(struct rpc_task *task) dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, 
xprt->servername); - xprt_release_write(xprt, task); task->tk_status = -EIO; } } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0fa7ed93dc20..e57d8ed2c4d8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -852,8 +852,6 @@ static void xs_close(struct rpc_xprt *xprt) dprintk("RPC: xs_close xprt %p\n", xprt); - cancel_delayed_work_sync(&transport->connect_worker); - xs_reset_transport(transport); xprt->reestablish_timeout = 0; @@ -2101,6 +2099,7 @@ static void xs_udp_setup_socket(struct work_struct *work) trace_rpc_socket_connect(xprt, sock, 0); status = 0; out: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); } @@ -2286,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case 0: case -EINPROGRESS: case -EALREADY: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); return; case -EINVAL: @@ -2303,6 +2303,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) out_eagain: status = -EAGAIN; out: + xprt_unlock_connect(xprt, transport); xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); } @@ -2325,6 +2326,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", -- cgit v1.2.3 From de84d89030fa4efa44c02c96c8b4a8176042c4ff Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 16:49:48 -0500 Subject: SUNRPC: TCP/UDP always close the old socket before reconnecting It is not safe to call xs_reset_transport() from inside xs_udp_setup_socket() or xs_tcp_setup_socket(), since they do not own the correct locks. Instead, do it in xs_connect(). 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e57d8ed2c4d8..e53a5ca03daf 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2081,8 +2081,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - /* Start by resetting any existing state */ - xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_DGRAM, IPPROTO_UDP, false); @@ -2328,6 +2326,9 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); + /* Start by resetting any existing state */ + xs_reset_transport(transport); + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", -- cgit v1.2.3 From 4efdd92c921135175a85452cd41273d9e2788db3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 15:34:28 -0500 Subject: SUNRPC: Remove TCP client connection reset hack Instead we rely on SO_REUSEPORT to provide the reconnection semantics that we need for NFSv2/v3. 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 67 +-------------------------------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e53a5ca03daf..dbf279cd4494 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -796,8 +796,6 @@ static void xs_error_report(struct sock *sk) dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); - if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state)) - goto out; xprt_wake_pending_tasks(xprt, err); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2102,57 +2100,6 @@ out: xprt_wake_pending_tasks(xprt, status); } -/* - * We need to preserve the port number so the reply cache on the server can - * find our cached RPC replies when we get around to reconnecting. - */ -static void xs_abort_connection(struct sock_xprt *transport) -{ - int result; - struct sockaddr any; - - dprintk("RPC: disconnecting xprt %p to reuse port\n", transport); - - /* - * Disconnect the transport socket by doing a connect operation - * with AF_UNSPEC. This should return immediately... 
- */ - memset(&any, 0, sizeof(any)); - any.sa_family = AF_UNSPEC; - result = kernel_connect(transport->sock, &any, sizeof(any), 0); - trace_rpc_socket_reset_connection(&transport->xprt, - transport->sock, result); - if (!result) - xs_sock_reset_connection_flags(&transport->xprt); - dprintk("RPC: AF_UNSPEC connect return code %d\n", result); -} - -static void xs_tcp_reuse_connection(struct sock_xprt *transport) -{ - unsigned int state = transport->inet->sk_state; - - if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED) { - /* we don't need to abort the connection if the socket - * hasn't undergone a shutdown - */ - if (transport->inet->sk_shutdown == 0) - return; - dprintk("RPC: %s: TCP_CLOSEd and sk_shutdown set to %d\n", - __func__, transport->inet->sk_shutdown); - } - if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT)) { - /* we don't need to abort the connection if the socket - * hasn't undergone a shutdown - */ - if (transport->inet->sk_shutdown == 0) - return; - dprintk("RPC: %s: ESTABLISHED/SYN_SENT " - "sk_shutdown set to %d\n", - __func__, transport->inet->sk_shutdown); - } - xs_abort_connection(transport); -} - static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -2245,18 +2192,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) status = PTR_ERR(sock); goto out; } - } else { - int abort_and_exit; - - abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, - &xprt->state); - /* "close" the socket, preserving the local port */ - set_bit(XPRT_CONNECTION_REUSE, &xprt->state); - xs_tcp_reuse_connection(transport); - clear_bit(XPRT_CONNECTION_REUSE, &xprt->state); - - if (abort_and_exit) - goto out_eagain; } dprintk("RPC: worker connecting xprt %p via %s to " @@ -2296,9 +2231,9 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EADDRINUSE: case -ENOBUFS: /* retry with existing socket, after a delay */ + 
xs_tcp_force_close(xprt); goto out; } -out_eagain: status = -EAGAIN; out: xprt_unlock_connect(xprt, transport); -- cgit v1.2.3 From 9cbc94fb06f98de0e8d393eaff09c790f4c3ba46 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 15:50:27 -0500 Subject: SUNRPC: Remove TCP socket linger code Now that we no longer use the partial shutdown code when closing the socket, we no longer need to worry about the TCP linger2 state. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dbf279cd4494..c65f74019288 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1427,37 +1427,6 @@ out: read_unlock_bh(&sk->sk_callback_lock); } -/* - * Do the equivalent of linger/linger2 handling for dealing with - * broken servers that don't close the socket in a timely - * fashion - */ -static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt, - unsigned long timeout) -{ - struct sock_xprt *transport; - - if (xprt_test_and_set_connecting(xprt)) - return; - set_bit(XPRT_CONNECTION_ABORT, &xprt->state); - transport = container_of(xprt, struct sock_xprt, xprt); - queue_delayed_work(rpciod_workqueue, &transport->connect_worker, - timeout); -} - -static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) -{ - struct sock_xprt *transport; - - transport = container_of(xprt, struct sock_xprt, xprt); - - if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) || - !cancel_delayed_work(&transport->connect_worker)) - return; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); - xprt_clear_connecting(xprt); -} - static void xs_sock_mark_closed(struct rpc_xprt *xprt) { xs_sock_reset_connection_flags(xprt); @@ -1513,7 +1482,6 @@ static void xs_tcp_state_change(struct sock *sk) clear_bit(XPRT_CONNECTED, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); smp_mb__after_atomic(); - 
xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ @@ -1530,13 +1498,11 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_LAST_ACK: set_bit(XPRT_CLOSING, &xprt->state); - xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout); smp_mb__before_atomic(); clear_bit(XPRT_CONNECTED, &xprt->state); smp_mb__after_atomic(); break; case TCP_CLOSE: - xs_tcp_cancel_linger_timeout(xprt); xs_sock_mark_closed(xprt); } out: @@ -2134,7 +2100,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* socket options */ sock_reset_flag(sk, SOCK_LINGER); - tcp_sk(sk)->linger2 = 0; tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); -- cgit v1.2.3 From 505936f59f1e4cd0ff92ae5abc7aae64fb74dbdb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 16:00:01 -0500 Subject: SUNRPC: Cleanup to remove remaining uses of XPRT_CONNECTION_ABORT Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c65f74019288..2f8db3499a17 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -804,7 +804,6 @@ static void xs_error_report(struct sock *sk) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); @@ -1904,7 +1903,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct socket *sock; int status = -EIO; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); if (status < 0) { @@ -2149,7 +2147,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) int status = -EIO; if (!sock) { - 
clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, SOCK_STREAM, IPPROTO_TCP, true); -- cgit v1.2.3 From 0efeac261c3f79c44fe61ee869722b77805c7ddf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Feb 2015 09:26:39 -0500 Subject: SUNRPC: Ensure xs_tcp_shutdown() requests a full close of the connection The previous behaviour left the connection half-open in order to try to scrape the last replies from the socket. Now that we have more reliable reconnection, change the behaviour to close down the socket faster. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2f8db3499a17..3d83cbd32ef2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -627,7 +627,7 @@ process_status: * @xprt: transport * * Initiates a graceful shutdown of the TCP socket by calling the - * equivalent of shutdown(SHUT_WR); + * equivalent of shutdown(SHUT_RDWR); */ static void xs_tcp_shutdown(struct rpc_xprt *xprt) { @@ -635,7 +635,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) struct socket *sock = transport->sock; if (sock != NULL) { - kernel_sock_shutdown(sock, SHUT_WR); + kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); } } -- cgit v1.2.3 From caf4ccd4e88cf2795c927834bc488c8321437586 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Feb 2015 09:23:34 -0500 Subject: SUNRPC: Make xs_tcp_close() do a socket shutdown rather than a sock_release Use of socket shutdown() means that we monitor the shutdown process through the xs_tcp_state_change() callback, so it is preferable to a full close in all cases unless we're destroying the transport. 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 3d83cbd32ef2..0279e8ffb14a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -857,10 +857,7 @@ static void xs_close(struct rpc_xprt *xprt) static void xs_tcp_close(struct rpc_xprt *xprt) { - if (test_and_clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state)) - xs_close(xprt); - else - xs_tcp_shutdown(xprt); + xs_tcp_shutdown(xprt); } static void xs_xprt_free(struct rpc_xprt *xprt) @@ -1033,7 +1030,6 @@ static void xs_udp_data_ready(struct sock *sk) */ static void xs_tcp_force_close(struct rpc_xprt *xprt) { - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); xprt_force_disconnect(xprt); } -- cgit v1.2.3 From 9e2b9f37760e129cee053cc7b6e7288acc2a7134 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 8 Feb 2015 19:21:27 -0500 Subject: SUNRPC: Remove the redundant XPRT_CONNECTION_CLOSE flag Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 1 - net/sunrpc/xprtsock.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ff3574df8344..e3015aede0d9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -683,7 +683,6 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); queue_work(rpciod_workqueue, &xprt->task_cleanup); return; out_abort: diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0279e8ffb14a..c72b13e2bdf5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -804,7 +804,6 @@ static void xs_error_report(struct sock *sk) static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); - clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, 
&xprt->state); smp_mb__after_atomic(); -- cgit v1.2.3 From b70ae915e4282854fb7864519e5ec559ab2de7c3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Feb 2015 09:41:32 -0500 Subject: SUNRPC: Handle connection reset more efficiently. If the connection reset is due to an active call on our side, then the state change is sometimes not reported. Catch those instances using xs_error_report() instead. Also remove the xs_tcp_shutdown() call in xs_tcp_send_request() as the change in behaviour makes it redundant. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c72b13e2bdf5..540d542d85e5 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -718,7 +718,6 @@ static int xs_tcp_send_request(struct rpc_task *task) dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); case -ECONNRESET: - xs_tcp_shutdown(xprt); case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: @@ -774,6 +773,21 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } +static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) +{ + smp_mb__before_atomic(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + clear_bit(XPRT_CLOSING, &xprt->state); + smp_mb__after_atomic(); +} + +static void xs_sock_mark_closed(struct rpc_xprt *xprt) +{ + xs_sock_reset_connection_flags(xprt); + /* Mark transport as closed and wake up all pending tasks */ + xprt_disconnect_done(xprt); +} + /** * xs_error_report - callback to handle TCP socket state errors * @sk: socket @@ -793,6 +807,9 @@ static void xs_error_report(struct sock *sk) err = -sk->sk_err; if (err == 0) goto out; + /* Is this a reset event? 
*/ + if (sk->sk_state == TCP_CLOSE) + xs_sock_mark_closed(xprt); dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); @@ -801,14 +818,6 @@ static void xs_error_report(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } -static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) -{ - smp_mb__before_atomic(); - clear_bit(XPRT_CLOSE_WAIT, &xprt->state); - clear_bit(XPRT_CLOSING, &xprt->state); - smp_mb__after_atomic(); -} - static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; @@ -1421,13 +1430,6 @@ out: read_unlock_bh(&sk->sk_callback_lock); } -static void xs_sock_mark_closed(struct rpc_xprt *xprt) -{ - xs_sock_reset_connection_flags(xprt); - /* Mark transport as closed and wake up all pending tasks */ - xprt_disconnect_done(xprt); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed -- cgit v1.2.3 From 54c09874929dcaac37ed62ad2eca45d960ba1a00 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Feb 2015 11:01:02 -0500 Subject: SUNRPC: Define xs_tcp_fin_timeout only if CONFIG_SUNRPC_DEBUG Now that the linger code is gone, the xs_tcp_fin_timeout variable has no real function. Keep it for now, since it is part of the /proc interface, but only define it if that /proc interface is enabled. 
Suggested-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 540d542d85e5..8ab02262c761 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -63,6 +63,8 @@ static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + #define XS_TCP_LINGER_TO (15U * HZ) static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; @@ -75,8 +77,6 @@ static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; * someone else's file names! */ -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; static unsigned int max_tcp_slot_table_limit = RPC_MAX_SLOT_TABLE_LIMIT; -- cgit v1.2.3 From 402e23b4ed9ed81852b6c15b793fcf84ea91e491 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Feb 2015 17:20:14 -0500 Subject: SUNRPC: Fix stupid typo in xs_sock_set_reuseport Yes, kernel_setsockopt() hates you for using a char argument. 
Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8ab02262c761..19f7526f8965 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1629,9 +1629,10 @@ static unsigned short xs_get_random_port(void) */ static void xs_sock_set_reuseport(struct socket *sock) { - char opt = 1; + int opt = 1; - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, + (char *)&opt, sizeof(opt)); } static unsigned short xs_sock_getport(struct socket *sock) -- cgit v1.2.3 From c627d31ba0696cbd829437af2be2f2dee3546b1e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Feb 2015 11:06:04 -0500 Subject: SUNRPC: Cleanup to remove xs_tcp_close() xs_tcp_close() is now just a call to xs_tcp_shutdown(), so remove it, and replace the entry in xs_tcp_ops. Suggested-by: Anna Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 19f7526f8965..66891e32c5e3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -863,11 +863,6 @@ static void xs_close(struct rpc_xprt *xprt) xprt_disconnect_done(xprt); } -static void xs_tcp_close(struct rpc_xprt *xprt) -{ - xs_tcp_shutdown(xprt); -} - static void xs_xprt_free(struct rpc_xprt *xprt) { xs_free_peer_addresses(xprt); @@ -2500,7 +2495,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .buf_free = rpc_free, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, - .close = xs_tcp_close, + .close = xs_tcp_shutdown, .destroy = xs_destroy, .print_stats = xs_tcp_print_stats, }; -- cgit v1.2.3