From ce8e04f6e5d3b2d14cd00cc4c0b1cc8cbdcf4d12 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 17 Feb 2023 08:22:17 -0700
Subject: io_uring: consolidate the put_ref-and-return section of adding work

We've got a few cases of this, move them to one section and just use
gotos to get there. Reduces the text section on both arm64 and x86-64,
using gcc-12.2.

Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3b915deb4d08..cbe06deb84ff 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1285,17 +1285,15 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	percpu_ref_get(&ctx->refs);
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
-		percpu_ref_put(&ctx->refs);
-		return;
-	}
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+		goto put_ref;
+
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
 	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
 		io_move_task_work_from_local(ctx);
-		percpu_ref_put(&ctx->refs);
-		return;
+		goto put_ref;
 	}
 
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1305,6 +1303,8 @@ static void io_req_local_work_add(struct io_kiocb *req)
 
 	if (READ_ONCE(ctx->cq_waiting))
 		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+
+put_ref:
	percpu_ref_put(&ctx->refs);
 }
 
--
cgit v1.2.3
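As an aside, the patch is a textbook goto-consolidation of refcount exit
paths: take the reference once on entry, funnel every early exit through a
single put label. A minimal userspace sketch of the same shape, with
illustrative names rather than io_uring internals:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int refs;

/* Illustrative only: mirrors the "single put label" shape of the patch. */
static void add_work(bool queued_first, bool cancel_pending)
{
	atomic_fetch_add(&refs, 1);	/* percpu_ref_get() analogue */

	if (!queued_first)
		goto put_ref;		/* someone else will run the work */

	if (cancel_pending)
		goto put_ref;		/* work moved off the local list */

	/* ... wake up the submitter task here ... */

put_ref:
	atomic_fetch_sub(&refs, 1);	/* one put covers every exit */
}

Because the success path also falls through to put_ref, a future early
exit added to the function cannot forget the put.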
From 8d664282a03fec09682f10252d3c785c2513691d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 17 Feb 2023 08:27:23 -0700
Subject: io_uring: rename 'in_idle' to 'in_cancel'

This better describes what it does - it's incremented when the task is
currently undergoing a cancelation operation, due to exiting or
exec'ing.

Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 18 +++++++++---------
 io_uring/tctx.c                |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 0efe4d784358..00689c12f6ab 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -58,7 +58,7 @@ struct io_uring_task {
 	struct xarray		xa;
 	struct wait_queue_head	wait;
 
-	atomic_t		in_idle;
+	atomic_t		in_cancel;
 	atomic_t		inflight_tracked;
 	struct percpu_counter	inflight;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cbe06deb84ff..64e07df034d1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -719,7 +719,7 @@ static void io_put_task_remote(struct task_struct *task, int nr)
 	struct io_uring_task *tctx = task->io_uring;
 
 	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
 	put_task_struct_many(task, nr);
 }
@@ -1258,8 +1258,8 @@ void tctx_task_work(struct callback_head *cb)
 
 	ctx_flush_and_put(ctx, &uring_locked);
 
-	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	/* relaxed read is enough as only the task itself sets ->in_cancel */
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 
 	trace_io_uring_task_work_run(tctx, count, loops);
@@ -1291,7 +1291,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
 
-	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
 		io_move_task_work_from_local(ctx);
 		goto put_ref;
 	}
@@ -2937,12 +2937,12 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb)
 
 	work = container_of(cb, struct io_tctx_exit, task_work);
 	/*
-	 * When @in_idle, we're in cancellation and it's racy to remove the
+	 * When @in_cancel, we're in cancellation and it's racy to remove the
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 * tctx can be NULL if the queueing of this task_work raced with
 	 * work cancelation off the exec path.
 	 */
-	if (tctx && !atomic_read(&tctx->in_idle))
+	if (tctx && !atomic_read(&tctx->in_cancel))
 		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
@@ -3210,7 +3210,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
 
-	atomic_inc(&tctx->in_idle);
+	atomic_inc(&tctx->in_cancel);
 	do {
 		bool loop = false;
 
@@ -3261,9 +3261,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (cancel_all) {
 		/*
 		 * We shouldn't run task_works after cancel, so just leave
-		 * ->in_idle set for normal exit.
+		 * ->in_cancel set for normal exit.
 		 */
-		atomic_dec(&tctx->in_idle);
+		atomic_dec(&tctx->in_cancel);
 		/* for exec all current's requests should be gone, kill tctx */
 		__io_uring_free(current);
 	}
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 4324b1cf1f6a..3a8d1dd97e1b 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->in_cancel, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	init_llist_head(&tctx->task_list);
--
cgit v1.2.3

From 6bf65a1b3668b04bb6c8126494d00303104eb9e5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 20 Feb 2023 14:13:52 +0000
Subject: io_uring/rsrc: fix a comment in io_import_fixed()

io_import_fixed() supports offsets, but "may not" means the opposite.
Replace it with "might not" so the comment speaks about possible cases
instead.

Signed-off-by: Pavel Begunkov
Reviewed-by: Gabriel Krisman Bertazi
Link: https://lore.kernel.org/r/5b5f79958456caa6dc532f6205f75f224b232c81.1676902343.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a59fc02de598..4e6191904c9e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1335,7 +1335,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		return -EFAULT;
 
 	/*
-	 * May not be a start of buffer, set size appropriately
+	 * Might not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
--
cgit v1.2.3
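Context for the comment being fixed: a fixed-buffer import is allowed to
start anywhere inside the registered region, and the importer derives a
byte offset from the registration start. A simplified sketch of that
arithmetic (field names mirror struct io_mapped_ubuf; the range check is
abridged, the kernel additionally guards against address overflow):

#include <stddef.h>
#include <stdint.h>

/* Simplified model of a registered buffer (cf. struct io_mapped_ubuf). */
struct mapped_ubuf {
	uint64_t ubuf;		/* start of the registered region */
	uint64_t ubuf_end;	/* one past the end */
};

/*
 * The import "might not be a start of buffer": the request may point
 * anywhere inside the registered region, so compute the byte offset
 * from the registration start before walking the bvec array.
 */
static int import_fixed(const struct mapped_ubuf *imu,
			uint64_t buf_addr, size_t len, uint64_t *offset)
{
	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf_end)
		return -1;	/* -EFAULT in the kernel */
	*offset = buf_addr - imu->ubuf;
	return 0;
}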
From 48ba08374e779421ca34bd14b4834aae19fc3e6a Mon Sep 17 00:00:00 2001
From: Wojciech Lukowicz
Date: Sat, 18 Feb 2023 18:41:41 +0000
Subject: io_uring: fix size calculation when registering buf ring

Using struct_size() to calculate the size of io_uring_buf_ring will sum
the size of the struct and of the bufs array. However, the struct's
fields are overlaid with the array making the calculated size larger
than it should be.

When registering a ring with N * PAGE_SIZE / sizeof(struct io_uring_buf)
entries, i.e. with fully filled pages, the calculated size will span one
more page than it should and io_uring will try to pin the following
page. Depending on how the application allocated the ring, it might
succeed using an unrelated page or fail returning EFAULT.

The size of the ring should be the product of ring_entries and the size
of io_uring_buf, i.e. the size of the bufs array only.

Fixes: c7fb19428d67 ("io_uring: add support for ring mapped supplied buffers")
Signed-off-by: Wojciech Lukowicz
Reviewed-by: Gabriel Krisman Bertazi
Link: https://lore.kernel.org/r/20230218184141.70891-1-wlukowicz01@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4a6401080c1f..3002dc827195 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -505,7 +505,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	}
 
 	pages = io_pin_pages(reg.ring_addr,
-			     struct_size(br, bufs, reg.ring_entries),
+			     flex_array_size(br, bufs, reg.ring_entries),
 			     &nr_pages);
 	if (IS_ERR(pages)) {
 		kfree(free_bl);
--
cgit v1.2.3
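The off-by-one-page is easy to reproduce with plain sizeof arithmetic.
The sketch below models the two uapi structs (layout mirrored from
include/uapi/linux/io_uring.h; the zero-length array is a GNU extension
standing in for the header's flex-array declaration). With 256 entries of
16 bytes each, the struct_size()-style sum comes to 4112 bytes and spills
into a second page, while the array alone is exactly 4096:

#include <stdint.h>
#include <stdio.h>

struct buf {			/* cf. struct io_uring_buf, 16 bytes */
	uint64_t addr;
	uint32_t len;
	uint16_t bid;
	uint16_t resv;
};

struct buf_ring {		/* cf. struct io_uring_buf_ring */
	union {
		struct {	/* header fields overlay bufs[0] */
			uint64_t resv1;
			uint32_t resv2;
			uint16_t resv3;
			uint16_t tail;
		};
		struct buf bufs[0];
	};
};

int main(void)
{
	unsigned entries = 256;	/* 256 * 16 == 4096, one full page */

	/* struct_size()-style: header + array, 16 bytes too many */
	printf("%zu\n", sizeof(struct buf_ring) + entries * sizeof(struct buf));
	/* flex_array_size()-style: the array alone, the right answer */
	printf("%zu\n", entries * sizeof(struct buf));
	return 0;
}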
From 9a1563d1720680bdc1d702486b7b73f51c079b32 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 22 Feb 2023 14:32:43 +0000
Subject: io_uring: remove unused wq_list_merge

There are no users of wq_list_merge, kill it.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/5f9ad0301949213230ad9000a8359d591aae615a.1677002255.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/slist.h | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/io_uring/slist.h b/io_uring/slist.h
index f27601fa4660..7c198a40d5f1 100644
--- a/io_uring/slist.h
+++ b/io_uring/slist.h
@@ -27,28 +27,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 	list->last = node;
 }
 
-/**
- * wq_list_merge - merge the second list to the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node after mergence.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
-						    struct io_wq_work_list *list1)
-{
-	struct io_wq_work_node *ret;
-
-	if (!list0->first) {
-		ret = list1->first;
-	} else {
-		ret = list0->first;
-		list0->last->next = list1->first;
-	}
-	INIT_WQ_LIST(list0);
-	INIT_WQ_LIST(list1);
-	return ret;
-}
-
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
 				    struct io_wq_work_list *list)
 {
--
cgit v1.2.3

From edd478269640b360c6f301f2baa04abdda563ef3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 22 Feb 2023 14:36:48 +0000
Subject: io_uring/rsrc: disallow multi-source reg buffers

If two or more mappings go back to back to each other they can be
passed into io_uring to be registered as a single registered buffer.
That would even work if mappings came from different sources, e.g. it's
possible to mix in this way anon pages and pages from shmem or hugetlb.
That is not a problem in itself, but the code is less error prone if we
forbid such mixing.

Cc:
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4e6191904c9e..53845e496881 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
 			      pages, vmas);
 	if (pret == nr_pages) {
+		struct file *file = vmas[0]->vm_file;
+
 		/* don't support file backed memory */
 		for (i = 0; i < nr_pages; i++) {
-			struct vm_area_struct *vma = vmas[i];
-
-			if (vma_is_shmem(vma))
+			if (vmas[i]->vm_file != file) {
+				ret = -EINVAL;
+				break;
+			}
+			if (!file)
 				continue;
-			if (vma->vm_file &&
-			    !is_file_hugepages(vma->vm_file)) {
+			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
 				ret = -EOPNOTSUPP;
 				break;
 			}
--
cgit v1.2.3

From b000ae0ec2d709046ac1a3c5722fea417f8a067e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 22 Feb 2023 14:36:50 +0000
Subject: io_uring/rsrc: optimise single entry advance

Iterating within the first bvec entry should be essentially free, but
we use iov_iter_advance() for that, which shows up in benchmark
profiles taking up to 0.5% of CPU. Replace it with a hand coded
version.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 53845e496881..ebbd2cea7582 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1364,7 +1364,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset <= bvec->bv_len) {
-			iov_iter_advance(iter, offset);
+			iter->bvec = bvec;
+			iter->nr_segs = bvec->bv_len;
+			iter->count -= offset;
+			iter->iov_offset = offset;
 		} else {
 			unsigned long seg_skip;
 
--
cgit v1.2.3
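For illustration, the same fast/slow split can be written against plain
iovecs in userspace. This is only a sketch of the shape of the
optimisation, not the kernel's iov_iter API, and it assumes the caller
has already validated that offset does not exceed the total length:

#include <stddef.h>
#include <sys/uio.h>

struct iter {
	const struct iovec *iov;	/* current segment array */
	size_t nr_segs;
	size_t iov_offset;		/* offset into iov[0] */
	size_t count;			/* total bytes remaining */
};

static void iter_advance(struct iter *it, size_t offset)
{
	it->count -= offset;
	if (offset <= it->iov[0].iov_len) {
		/* fast path: the offset stays inside the first segment,
		 * so just set the state directly, no generic helper */
		it->iov_offset = offset;
		return;
	}
	/* slow path: skip whole segments first, like seg_skip above */
	while (offset > it->iov[0].iov_len) {
		offset -= it->iov[0].iov_len;
		it->iov++;
		it->nr_segs--;
	}
	it->iov_offset = offset;
}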
From 57bebf807e2abcf87d96b9de1266104ee2d8fc2f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 22 Feb 2023 14:36:51 +0000
Subject: io_uring/rsrc: optimise registered huge pages

When registering huge pages, internally io_uring will split them into
many PAGE_SIZE bvec entries. That's bad for performance as drivers need
to eventually dma-map the data and will do it individually for each
bvec entry. Coalesce huge pages into one large bvec.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 38 ++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index ebbd2cea7582..aab1bc6883e9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1210,6 +1210,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
+	struct folio *folio;
 
 	*pimu = ctx->dummy_ubuf;
 	if (!iov->iov_base)
@@ -1224,6 +1225,21 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
 
+	/* If it's a huge page, try to coalesce them into a single bvec entry */
+	if (nr_pages > 1) {
+		folio = page_folio(pages[0]);
+		for (i = 1; i < nr_pages; i++) {
+			if (page_folio(pages[i]) != folio) {
+				folio = NULL;
+				break;
+			}
+		}
+		if (folio) {
+			folio_put_refs(folio, nr_pages - 1);
+			nr_pages = 1;
+		}
+	}
+
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
 		goto done;
@@ -1236,6 +1252,17 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 
 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = nr_pages;
+	*pimu = imu;
+	ret = 0;
+
+	if (folio) {
+		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+		goto done;
+	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
 
@@ -1244,12 +1271,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		off = 0;
 		size -= vec_len;
 	}
-	/* store original address for later verification */
-	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
-	*pimu = imu;
-	ret = 0;
 done:
 	if (ret)
 		kvfree(imu);
@@ -1364,6 +1385,11 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 
 		if (offset <= bvec->bv_len) {
+			/*
+			 * Note, huge pages buffers consists of one large
+			 * bvec entry and should always go this way. The other
+			 * branch doesn't expect non PAGE_SIZE'd chunks.
+			 */
 			iter->bvec = bvec;
 			iter->nr_segs = bvec->bv_len;
 			iter->count -= offset;
--
cgit v1.2.3
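A toy model of the coalescing check: every pinned page must belong to the
same folio, otherwise the buffer keeps its per-page bvec entries. For a
2MB huge page this collapses 512 PAGE_SIZE entries into a single one, so
drivers end up dma-mapping one segment instead of 512. The folio id below
is an illustrative stand-in for the kernel's page_folio() comparison:

#include <stdbool.h>

/* Illustrative: an opaque per-page tag modelling "which folio". */
struct page_ref { unsigned long folio_id; };

static bool can_coalesce(const struct page_ref *pages, int nr_pages)
{
	for (int i = 1; i < nr_pages; i++)
		if (pages[i].folio_id != pages[0].folio_id)
			return false;	/* spans folios: keep per-page bvecs */
	return true;			/* one folio: one large bvec */
}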
From 977bc87356107fb946fb4ff24f1e4c241b5043ec Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 24 Feb 2023 09:54:57 -0700
Subject: io_uring/rsrc: always initialize 'folio' to NULL

Smatch complains that:

smatch warnings:
io_uring/rsrc.c:1262 io_sqe_buffer_register() error: uninitialized symbol 'folio'.

'folio' may be used uninitialized, which can happen if we end up with a
single page mapped. Ensure that we clear folio to NULL at the top so
it's always set.

Reported-by: kernel test robot
Reported-by: Dan Carpenter
Link: https://lore.kernel.org/r/202302241432.YML1CD5C-lkp@intel.com/
Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index aab1bc6883e9..056f40946ff6 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1210,7 +1210,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio;
+	struct folio *folio = NULL;
 
 	*pimu = ctx->dummy_ubuf;
 	if (!iov->iov_base)
--
cgit v1.2.3

From 7605c43d67face310b4b87dee1a28bc0c8cd8c0f Mon Sep 17 00:00:00 2001
From: David Lamparter
Date: Fri, 24 Feb 2023 16:01:24 +0100
Subject: io_uring: remove MSG_NOSIGNAL from recvmsg

MSG_NOSIGNAL is not applicable for the receiving side, SIGPIPE is
generated when trying to write to a "broken pipe". AF_PACKET's
packet_recvmsg() does enforce this, giving back EINVAL when
MSG_NOSIGNAL is set - making it unusable in io_uring's recvmsg.

Remove MSG_NOSIGNAL from io_recvmsg_prep().

Cc: stable@vger.kernel.org # v5.10+
Signed-off-by: David Lamparter
Cc: Eric Dumazet
Cc: Jens Axboe
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/r/20230224150123.128346-1-equinox@diac24.net
Signed-off-by: Jens Axboe
---
 io_uring/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index cbd4b725f58c..b7f190ca528e 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~(RECVMSG_FLAGS))
 		return -EINVAL;
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
--
cgit v1.2.3
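For contrast, here is where the flag does belong. A small userspace
sketch using the standard sockets API, nothing io_uring specific:

#include <sys/types.h>
#include <sys/socket.h>

static ssize_t send_quiet(int fd, const void *buf, size_t len)
{
	/* send side: MSG_NOSIGNAL turns SIGPIPE into an EPIPE return */
	return send(fd, buf, len, MSG_NOSIGNAL);
}

static ssize_t recv_plain(int fd, void *buf, size_t len)
{
	/* receive side: no MSG_NOSIGNAL; on an AF_PACKET socket the
	 * kernel would fail the call with EINVAL if it were set */
	return recv(fd, buf, len, 0);
}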
From c16bda37594f83147b167d381d54c010024efecf Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 25 Feb 2023 12:53:53 -0700
Subject: io_uring/poll: allow some retries for poll triggering spuriously

If we get woken spuriously when polling and fail the operation with
-EAGAIN again, then we generally only allow polling again if data had
been transferred at some point. This is indicated with
REQ_F_PARTIAL_IO. However, if the spurious poll triggers when the
socket was originally empty, then we haven't transferred data yet and
we will fail the poll re-arm. This either punts the socket to io-wq if
it's blocking, or it fails the request with -EAGAIN if not. Neither
condition is desirable, as the former will slow things down, while the
latter will make the application confused.

We want to ensure that a repeated poll trigger doesn't lead to infinite
work making no progress, that's what the REQ_F_PARTIAL_IO check was
for. But it doesn't protect against a loop post the first receive, and
it's unnecessarily strict if we started out with an empty socket.

Add a somewhat random retry count, just to put an upper limit on the
potential number of retries that will be done. This should be high
enough that we won't really hit it in practice, unless something needs
to be aborted anyway.

Cc: stable@vger.kernel.org # v5.10+
Link: https://github.com/axboe/liburing/issues/364
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 14 ++++++++++++--
 io_uring/poll.h |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8339a92b4510..a82d6830bdfd 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -650,6 +650,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
 
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY	128
+
 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 					     unsigned issue_flags)
 {
@@ -665,14 +673,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 		if (entry == NULL)
 			goto alloc_apoll;
 		apoll = container_of(entry, struct async_poll, cache);
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
 alloc_apoll:
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
 			return NULL;
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	}
 	apoll->double_poll = NULL;
 	req->apoll = apoll;
+	if (unlikely(!--apoll->poll.retries))
+		return NULL;
 	return apoll;
 }
 
@@ -694,8 +706,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 		return IO_APOLL_ABORTED;
 	if (!file_can_poll(req->file))
 		return IO_APOLL_ABORTED;
-	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
-		return IO_APOLL_ABORTED;
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
 		mask |= EPOLLONESHOT;
 
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 5f3bae50fc81..b2393b403a2c 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -12,6 +12,7 @@ struct io_poll {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
+	int				retries;
 	struct wait_queue_entry		wait;
 };
 
--
cgit v1.2.3
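Reduced to a sketch, the fix swaps a strict "no progress means no re-arm"
gate for a per-arm retry budget. Names below are illustrative, not the
kernel's:

#include <stdbool.h>

#define MAX_SPURIOUS_RETRIES 128	/* mirrors APOLL_MAX_RETRY */

struct poll_state {
	int retries;
};

/* Budget a bounded number of re-arms instead of refusing after the
 * first try without data transfer. The counter is refilled only on a
 * fresh arm, so a request stuck in a spurious-wakeup loop eventually
 * runs out and is aborted.
 */
static bool arm_poll(struct poll_state *ps, bool first_arm)
{
	if (first_arm)
		ps->retries = MAX_SPURIOUS_RETRIES;
	if (--ps->retries <= 0)
		return false;	/* give up: looks like a livelock */
	/* ... register the wait entry and return to the caller ... */
	return true;
}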
From 54aa7f2330b82884f4a1afce0220add6e8312f8b Mon Sep 17 00:00:00 2001
From: Joseph Qi
Date: Tue, 28 Feb 2023 12:54:59 +0800
Subject: io_uring: fix fget leak when fs don't support nowait buffered read

Heming reported a BUG when using io_uring doing link-cp on ocfs2. [1]

The following steps can reproduce this BUG:
mount -t ocfs2 /dev/vdc /mnt/ocfs2
cp testfile /mnt/ocfs2/
./link-cp /mnt/ocfs2/testfile /mnt/ocfs2/testfile.1
umount /mnt/ocfs2

Then umount will fail, and it outputs:
umount: /mnt/ocfs2: target is busy.

Tracing umount shows that mnt_get_count() does not return as expected.
After a deep investigation of fget()/fput() on the related code flow, I
finally found that fget() leaks because ocfs2 doesn't support nowait
buffered read.

io_issue_sqe
|-io_assign_file  // do fget() first
  |-io_read
  |-io_iter_do_read
  |-ocfs2_file_read_iter  // return -EOPNOTSUPP
  |-kiocb_done
  |-io_rw_done
  |-__io_complete_rw_common  // set REQ_F_REISSUE
  |-io_resubmit_prep
  |-io_req_prep_async  // override req->file, leak happens

This was introduced by commit a196c78b5443 in v5.18. Fix it by not
re-assigning req->file if it has already been assigned.

[1] https://lore.kernel.org/ocfs2-devel/ab580a75-91c8-d68a-3455-40361be1bfa8@linux.alibaba.com/T/#t

Fixes: a196c78b5443 ("io_uring: assign non-fixed early for async work")
Cc:
Reported-by: Heming Zhao
Signed-off-by: Joseph Qi
Cc: Xiaoguang Wang
Link: https://lore.kernel.org/r/20230228045459.13524-1-joseph.qi@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 64e07df034d1..7625597b5227 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1777,7 +1777,7 @@ int io_req_prep_async(struct io_kiocb *req)
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 
 	/* assign early for deferred execution for non-fixed file */
-	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
 		req->file = io_file_get_normal(req, req->cqe.fd);
 	if (!cdef->prep_async)
 		return 0;
--
cgit v1.2.3

From 1947ddf9b3d5b886ba227bbfd3d6f501af08b5b0 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 27 Feb 2023 09:41:20 -0700
Subject: io_uring/poll: don't pass in wake func to io_init_poll_iocb()

We only use one, and it's io_poll_wake(). Hardwire that in the initial
init, as well as in __io_queue_proc() if we're setting up for double
poll.

Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index a82d6830bdfd..795facbd0e9f 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -51,6 +51,9 @@ struct io_poll_table {
 
 #define IO_WQE_F_DOUBLE		1
 
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key);
+
 static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
 {
 	unsigned long priv = (unsigned long)wqe->private;
@@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
 	}
 }
 
-static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
-			      wait_queue_func_t wake_func)
+static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 {
 	poll->head = NULL;
 #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
 	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, wake_func);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
 static inline void io_poll_remove_entry(struct io_poll *poll)
@@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 
 		/* mark as double wq entry */
 		wqe_private |= IO_WQE_F_DOUBLE;
-		io_init_poll_iocb(poll, first->events, first->wait.func);
+		io_init_poll_iocb(poll, first->events);
 		if (!io_poll_double_prepare(req)) {
 			/* the request is completing, just back off */
 			kfree(poll);
@@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 
 	INIT_HLIST_NODE(&req->hash_node);
 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
-	io_init_poll_iocb(poll, mask, io_poll_wake);
+	io_init_poll_iocb(poll, mask);
 	poll->file = req->file;
 	req->apoll_events = poll->events;
--
cgit v1.2.3
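Returning to the fget leak fix: it reduces to making the early file
assignment idempotent, so the reissue path cannot take a second
reference it will never drop. A self-contained model, with the refcount
bookkeeping standing in for fget()/fput():

#include <assert.h>
#include <stddef.h>

struct file_ref { int refs; };

static struct file_ref table_entry = { .refs = 1 };

static struct file_ref *get_file_ref(void)
{
	table_entry.refs++;	/* fget() analogue */
	return &table_entry;
}

struct request { struct file_ref *file; };

static void prep_async(struct request *req)
{
	if (!req->file)		/* the fix: assign only once */
		req->file = get_file_ref();
}

int main(void)
{
	struct request req = { .file = NULL };

	prep_async(&req);	/* first (issue) path takes the ref */
	prep_async(&req);	/* reissue path: no second ref taken */
	assert(table_entry.refs == 2);	/* one user ref, not two */
	return 0;
}

Without the !req->file guard, the second call would bump the count to 3
while the request still drops only one reference, which is exactly the
leaked mount reference that made umount report "target is busy".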