diff options
Diffstat (limited to 'fs')
154 files changed, 3795 insertions, 3015 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 32dff7ba3dda..21e154516bf2 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || (SUPERH && !MMU)) + depends on ARM || ((M68K || SUPERH) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 94aa7356248e..79f6b74336d2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* skip if starts before the current position */ - if (offset < curr) + if (offset < curr) { + if (next > curr) + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 30b066299d39..65b439cd53d2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,6 +25,9 @@ #include "internal.h" #include "afs_fs.h" +// Temporary: netfs does disgusting things with inode pointers +#pragma GCC diagnostic ignored "-Wattribute-warning" + static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, }; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7584aa6e5025..e5221be6eb55 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct iov_iter iter; ssize_t err = 0; size_t len; + int mode; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) goto out; /* We need to fetch the inline data. 
*/ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -604,8 +606,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); return PTR_ERR(req); + } set_page_writeback(page); if (caching) @@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; @@ -1652,10 +1656,23 @@ int ceph_uninline_data(struct file *file) int err = 0; u64 len; + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == CEPH_INLINE_NONE) + return 0; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + folio = read_mapping_folio(inode->i_mapping, 0, file); if (IS_ERR(folio)) { err = PTR_ERR(folio); @@ -1664,17 +1681,6 @@ int ceph_uninline_data(struct file *file) folio_lock(folio); - spin_lock(&ci->i_ceph_lock); - inline_version = ci->i_inline_version; - spin_unlock(&ci->i_ceph_lock); - - dout("uninline_data %p %llx.%llx inline_version %llu\n", - inode, ceph_vinop(inode), inline_version); - - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) 
- goto out_unlock; - len = i_size_read(inode); if (len > folio_size(folio)) len = folio_size(folio); @@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); +out_uninline: if (!err) { int dirty; @@ -1757,8 +1764,10 @@ out_put_req: if (err == -ECANCELED) err = 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } out: ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", @@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) if (!mapping->a_ops->read_folio) return -ENOEXEC; - file_accessed(file); vma->vm_ops = &ceph_vmops; return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c14ef04e474..bf2e94005598 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; - struct ceph_cap_flush *cf; + struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { @@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, } ret = -ENOENT; - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { - if (cf->tid >= first_tid) { + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; ret = 0; break; } @@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; + bool queue_writeback = false; if (session) ceph_get_mds_session(session); @@ -2062,10 +2064,27 @@ retry: } /* completed revocation? going down and there are no caps? 
*/ - if (revoking && (revoking & cap_used) == 0) { - dout("completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; + if (revoking) { + if ((revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before the ceph_check_caps() is called, + * the Fb capability revoking will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finished the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; } /* want more caps from mds? */ @@ -2135,6 +2154,8 @@ ack: spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); + if (queue_writeback) + ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } @@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) } /* - * wait for any unsafe requests to complete. + * flush the mdlog and wait for any unsafe requests to complete. */ -static int unsafe_request_wait(struct inode *inode) +static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -2336,7 +2357,7 @@ retry: kfree(sessions); } - dout("unsafe_request_wait %p wait on tid %llu %llu\n", + dout("%s %p wait on tid %llu %llu\n", __func__, inode, req1 ? req1->r_tid : 0ULL, req2 ? 
req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, @@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - err = unsafe_request_wait(inode); + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds @@ -3182,10 +3203,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap = NULL; + struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; - bool found = false; bool flush_snaps = false; bool complete_capsnap = false; @@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->context == snapc) { - found = true; + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->context == snapc) { + capsnap = iter; break; } } - if (!found) { + if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. 
@@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - bool flushed = false; + struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; @@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, inode, ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->cap_flush.tid != flush_tid) { + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->cap_flush.tid); + " %lld\n", iter, follows, + flush_tid, iter->cap_flush.tid); break; } - flushed = true; + capsnap = iter; break; } else { dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + iter, iter->follows); } } - if (flushed) + if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (flushed) { + if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 63113e2a4890..5f1cc2b4ed06 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -20,6 +20,9 @@ #include "cache.h" #include <linux/ceph/decode.h> +// Temporary: netfs does disgusting things with inode pointers +#pragma GCC diagnostic ignored "-Wattribute-warning" + /* * Ceph inode operations * @@ -578,7 +581,7 @@ void ceph_evict_inode(struct inode *inode) __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* @@ -1466,10 
+1469,12 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } goto done; } @@ -1884,7 +1889,6 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -2258,6 +2262,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. And after that the lock state + * will change back. + * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & CEPH_STAT_RSTAT)) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. @@ -2281,7 +2309,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; - mode = (mask & CEPH_STAT_RSTAT) ? 
USE_AUTH_MDS : USE_ANY_MDS; + mode = ceph_try_to_choose_auth_mds(inode, mask); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); @@ -2423,7 +2451,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return -ESTALE; /* Skip the getattr altogether if we're asked not to sync */ - if (!(flags & AT_STATX_DONT_SYNC)) { + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { err = ceph_do_getattr(inode, statx_to_caps(request_mask, inode->i_mode), flags & AT_STATX_FORCE_SYNC); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 00c3de177dd6..f5d110d90b77 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_32_safe(p, end, sets, bad); dout("got %u sets of delegated inodes\n", sets); while (sets--) { - u64 start, len, ino; + u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); @@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, continue; } while (len--) { - int err = xa_insert(&s->s_delegated_inos, ino = start++, + int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { @@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; - int flags = 0; + int flags = 0, max_retry; + + /* + * The type of 'r_attempts' in kernel 'ceph_mds_request' + * is 'int', while in 'ceph_mds_request_head' the type of + * 'num_retry' is '__u8'. So in case the request retries + * exceeding 256 times, the MDS will receive a incorrect + * retry seq. + * + * In this case it's ususally a bug in MDS and continue + * retrying the request makes no sense. + * + * In future this could be fixed in ceph code, so avoid + * using the hardcode here. 
+ */ + max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); + max_retry = 1 << (max_retry * BITS_PER_BYTE); + if (req->r_attempts >= max_retry) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } req->r_attempts++; if (req->r_inode) { @@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, else req->r_sent_on_mseq = -1; } - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { @@ -3265,6 +3286,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; + bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); @@ -3273,16 +3295,41 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { + mutex_unlock(&mdsc->mutex); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); - goto out; /* dup reply? */ + return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); + /* + * The type of 'num_fwd' in ceph 'MClientRequestForward' + * is 'int32_t', while in 'ceph_mds_request_head' the + * type is '__u8'. So in case the request bounces between + * MDSes exceeding 256 times, the client will get stuck. + * + * In this case it's ususally a bug in MDS and continue + * bouncing the request makes no sense. + * + * In future this could be fixed in ceph code, so avoid + * using the hardcode here. 
+ */ + int max = sizeof_field(struct ceph_mds_request_head, num_fwd); + max = 1 << (max * BITS_PER_BYTE); + if (req->r_num_fwd >= max) { + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", + tid); + } else { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } } else { /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -3294,9 +3341,12 @@ static void handle_forward(struct ceph_mds_client *mdsc, put_request_session(req); __do_request(mdsc, req); } - ceph_mdsc_put_request(req); -out: mutex_unlock(&mdsc->mutex); + + /* kick calling process */ + if (aborted) + complete_request(mdsc, req); + ceph_mdsc_put_request(req); return; bad: @@ -3375,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session, } if (msg_version >= 5) { - u32 flags; - /* version >= 4, struct_v, struct_cv, len, metric_spec */ - ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); + u32 flags, len; + + /* version >= 4 */ + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ + ceph_decode_32_safe(&p, end, len, bad); /* len */ + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ + /* version >= 5, flags */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { - pr_warn("mds%d session blocklisted\n", session->s_mds); + pr_warn("mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } @@ -4396,12 +4450,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); - /* - * if this is a preemptive lease RELEASE, no need to - * flush request stream, since the actual request will - * soon 
follow. - */ - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); ceph_con_send(&session->s_con, msg); } @@ -4696,15 +4744,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) } /* - * wait for all write mds requests to flush. + * flush the mdlog and wait for all write mds requests to flush. */ -static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, + u64 want_tid) { struct ceph_mds_request *req = NULL, *nextreq; + struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("wait_unsafe_requests want %lld\n", want_tid); + dout("%s want %lld\n", __func__, want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -4716,14 +4766,32 @@ restart: nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_mds_session *s = req->r_session; + + if (!s) { + req = nextreq; + continue; + } + /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); + s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", + + /* send flush mdlog request to MDS */ + if (last_session != s) { + send_flush_mdlog(s); + ceph_put_mds_session(last_session); + last_session = s; + } else { + ceph_put_mds_session(s); + } + dout("%s wait on %llu (want %llu)\n", __func__, req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) @@ -4738,7 +4806,8 @@ restart: req = nextreq; } mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests done\n"); + ceph_put_mds_session(last_session); + dout("%s done\n", __func__); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) @@ -4767,7 +4836,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); - 
wait_unsafe_requests(mdsc, want_tid); + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 33497846e47e..1140aecd82ce 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, - TASK_INTERRUPTIBLE); + TASK_KILLABLE); } extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index a338a3ec0dc4..64592adfe48f 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) /* * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files - * or max_bytes). If the root is reached, return the root ceph_snap_realm - * instead. + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, return the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * will be restarted. */ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, bool retry) + struct inode *inode, + enum quota_get_realm which_quota, + bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -248,7 +250,7 @@ restart: } ci = ceph_inode(in); - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, which_quota); iput(in); next = realm->parent; @@ -279,8 +281,8 @@ restart: * dropped and we can then restart the whole operation. 
*/ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, true); - new_realm = get_quota_realm(mdsc, new, false); + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); if (PTR_ERR(new_realm) == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) @@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), + QUOTA_GET_MAX_BYTES, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e6987d295079..b73b4f75462c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; ret = set_anon_super_fc(s, fc); if (ret != 0) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20ceab74e871..dd7dac0f984a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1022,6 +1022,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode) ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) @@ -1278,9 +1279,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ -static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) + +enum quota_get_realm { + QUOTA_GET_MAX_FILES, + QUOTA_GET_MAX_BYTES, + QUOTA_GET_ANY +}; + +static inline bool __ceph_has_quota(struct ceph_inode_info *ci, + 
enum quota_get_realm which) { - return ci->i_max_files || ci->i_max_bytes; + bool has_quota = false; + + switch (which) { + case QUOTA_GET_MAX_BYTES: + has_quota = !!ci->i_max_bytes; + break; + case QUOTA_GET_MAX_FILES: + has_quota = !!ci->i_max_files; + break; + default: + has_quota = !!(ci->i_max_files || ci->i_max_bytes); + } + return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); @@ -1289,10 +1310,10 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; - had_quota = __ceph_has_any_quota(ci); + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index afec84088471..8c2dc2c762a4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, } #define XATTR_RSTAT_FIELD(_type, _name) \ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ @@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), - XATTR_RSTAT_FIELD(dir, rctime), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), { .name = "ceph.dir.pin", .name_size = sizeof("ceph.dir.pin"), diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index cc8fdcb35b71..8c9f2c00be72 100644 --- 
a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \ cifs_unicode.o nterr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ + readdir.o ioctl.o sess.o export.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o @@ -30,3 +30,5 @@ cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o + +cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 180c234c2f46..1e4c7cc5287f 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -465,7 +465,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a int ret = 0; /* Store the reconnect address */ - mutex_lock(&tcon->ses->server->srv_mutex); + cifs_server_lock(tcon->ses->server); if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr)) goto unlock; @@ -501,7 +501,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a cifs_signal_cifsd_for_reconnect(tcon->ses->server, false); unlock: - mutex_unlock(&tcon->ses->server->srv_mutex); + cifs_server_unlock(tcon->ses->server); return ret; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 0912d8bbbac1..663cb9db4908 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -236,9 +236,9 @@ int cifs_verify_signature(struct smb_rqst *rqst, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = cifs_calc_signature(rqst, server, what_we_think_sig_should_be); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc) return rc; @@ -626,7 
+626,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) memcpy(ses->auth_key.response + baselen, tiblob, tilen); - mutex_lock(&ses->server->srv_mutex); + cifs_server_lock(ses->server); rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5, @@ -678,7 +678,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); unlock: - mutex_unlock(&ses->server->srv_mutex); + cifs_server_unlock(ses->server); setup_ntlmv2_rsp_ret: kfree(tiblob); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f539a39d47f5..12c872800326 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -838,7 +838,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, int flags, struct smb3_fs_context *old_ctx) { int rc; - struct super_block *sb; + struct super_block *sb = NULL; struct cifs_sb_info *cifs_sb = NULL; struct cifs_mnt_data mnt_data; struct dentry *root; @@ -934,9 +934,11 @@ out_super: return root; out: if (cifs_sb) { - kfree(cifs_sb->prepath); - smb3_cleanup_fs_context(cifs_sb->ctx); - kfree(cifs_sb); + if (!sb || IS_ERR(sb)) { /* otherwise kill_sb will handle */ + kfree(cifs_sb->prepath); + smb3_cleanup_fs_context(cifs_sb->ctx); + kfree(cifs_sb); + } } return root; } diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c0542bdcd06b..dd7e070ca243 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,6 +152,7 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define SMB3_PRODUCT_BUILD 35 -#define CIFS_VERSION "2.36" +/* when changing internal version - update following two lines at same time */ +#define SMB3_PRODUCT_BUILD 37 +#define CIFS_VERSION "2.37" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 68da230c7f11..f873379066c7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,6 +16,7 @@ #include 
<linux/mempool.h> #include <linux/workqueue.h> #include <linux/utsname.h> +#include <linux/sched/mm.h> #include <linux/netfs.h> #include "cifs_fs_sb.h" #include "cifsacl.h" @@ -628,7 +629,8 @@ struct TCP_Server_Info { unsigned int in_flight; /* number of requests on the wire to server */ unsigned int max_in_flight; /* max number of requests that were on wire */ spinlock_t req_lock; /* protect the two values above */ - struct mutex srv_mutex; + struct mutex _srv_mutex; + unsigned int nofs_flag; struct task_struct *tsk; char server_GUID[16]; __u16 sec_mode; @@ -743,6 +745,22 @@ struct TCP_Server_Info { #endif }; +static inline void cifs_server_lock(struct TCP_Server_Info *server) +{ + unsigned int nofs_flag = memalloc_nofs_save(); + + mutex_lock(&server->_srv_mutex); + server->nofs_flag = nofs_flag; +} + +static inline void cifs_server_unlock(struct TCP_Server_Info *server) +{ + unsigned int nofs_flag = server->nofs_flag; + + mutex_unlock(&server->_srv_mutex); + memalloc_nofs_restore(nofs_flag); +} + struct cifs_credits { unsigned int value; unsigned int instance; @@ -1945,11 +1963,13 @@ extern mempool_t *cifs_mid_poolp; /* Operations for different SMB versions */ #define SMB1_VERSION_STRING "1.0" +#define SMB20_VERSION_STRING "2.0" +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; -#define SMB20_VERSION_STRING "2.0" extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 53373a3649e1..d46702f5a663 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -148,7 +148,7 @@ static void cifs_resolve_server(struct work_struct *work) struct TCP_Server_Info *server = container_of(work, struct 
TCP_Server_Info, resolve.work); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * Resolve the hostname again to make sure that IP address is up-to-date. @@ -159,7 +159,7 @@ static void cifs_resolve_server(struct work_struct *work) __func__, rc); } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } /* @@ -267,7 +267,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) /* do not want to be sending data on a socket we are freeing */ cifs_dbg(FYI, "%s: tearing down socket\n", __func__); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (server->ssocket) { cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state, server->ssocket->flags); @@ -296,7 +296,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) mid->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_entry_safe(mid, nmid, &retry_list, qhead) { @@ -306,9 +306,9 @@ cifs_abort_connection(struct TCP_Server_Info *server) } if (cifs_rdma_enabled(server)) { - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smbd_destroy(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } } @@ -359,7 +359,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, do { try_to_freeze(); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (!cifs_swn_set_server_dstaddr(server)) { /* resolve the hostname again to make sure that IP address is up-to-date */ @@ -372,7 +372,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, else rc = generic_ip_connect(server); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); msleep(3000); } else { @@ -383,7 +383,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, server->tcpStatus = CifsNeedNegotiate; spin_unlock(&cifs_tcp_ses_lock); 
cifs_swn_reset_server_dstaddr(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); @@ -488,12 +488,12 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) do { try_to_freeze(); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = reconnect_target_unlocked(server, &tl, &target_hint); if (rc) { /* Failed to reconnect socket */ - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); msleep(3000); continue; @@ -510,7 +510,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); @@ -1565,7 +1565,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); INIT_LIST_HEAD(&tcp_ses->pending_mid_q); - mutex_init(&tcp_ses->srv_mutex); + mutex_init(&tcp_ses->_srv_mutex); memcpy(tcp_ses->workstation_RFC1001_name, ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); memcpy(tcp_ses->server_RFC1001_name, @@ -1845,7 +1845,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) unsigned int rc, xid; unsigned int chan_count; struct TCP_Server_Info *server = ses->server; - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); if (ses->ses_status == SES_EXITING) { diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index c5dd6f7305bd..34a8f3baed5e 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1229,6 +1229,30 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) kref_put(&mg->refcount, mount_group_release); } +/* Extract share from DFS target 
and return a pointer to prefix path or NULL */ +static const char *parse_target_share(const char *target, char **share) +{ + const char *s, *seps = "/\\"; + size_t len; + + s = strpbrk(target + 1, seps); + if (!s) + return ERR_PTR(-EINVAL); + + len = strcspn(s + 1, seps); + if (!len) + return ERR_PTR(-EINVAL); + s += len; + + len = s - target + 1; + *share = kstrndup(target, len, GFP_KERNEL); + if (!*share) + return ERR_PTR(-ENOMEM); + + s = target + len; + return s + strspn(s, seps); +} + /** * dfs_cache_get_tgt_share - parse a DFS target * @@ -1242,56 +1266,46 @@ void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, char **prefix) { - char *s, sep, *p; - size_t len; - size_t plen1, plen2; + char sep; + char *target_share; + char *ppath = NULL; + const char *target_ppath, *dfsref_ppath; + size_t target_pplen, dfsref_pplen; + size_t len, c; if (!it || !path || !share || !prefix || strlen(path) < it->it_path_consumed) return -EINVAL; - *share = NULL; - *prefix = NULL; - sep = it->it_name[0]; if (sep != '\\' && sep != '/') return -EINVAL; - s = strchr(it->it_name + 1, sep); - if (!s) - return -EINVAL; + target_ppath = parse_target_share(it->it_name, &target_share); + if (IS_ERR(target_ppath)) + return PTR_ERR(target_ppath); - /* point to prefix in target node */ - s = strchrnul(s + 1, sep); + /* point to prefix in DFS referral path */ + dfsref_ppath = path + it->it_path_consumed; + dfsref_ppath += strspn(dfsref_ppath, "/\\"); - /* extract target share */ - *share = kstrndup(it->it_name, s - it->it_name, GFP_KERNEL); - if (!*share) - return -ENOMEM; + target_pplen = strlen(target_ppath); + dfsref_pplen = strlen(dfsref_ppath); - /* skip separator */ - if (*s) - s++; - /* point to prefix in DFS path */ - p = path + it->it_path_consumed; - if (*p == sep) - p++; - - /* merge prefix paths from DFS path and target node */ - plen1 = it->it_name + strlen(it->it_name) - s; - 
plen2 = path + strlen(path) - p; - if (plen1 || plen2) { - len = plen1 + plen2 + 2; - *prefix = kmalloc(len, GFP_KERNEL); - if (!*prefix) { - kfree(*share); - *share = NULL; + /* merge prefix paths from DFS referral path and target node */ + if (target_pplen || dfsref_pplen) { + len = target_pplen + dfsref_pplen + 2; + ppath = kzalloc(len, GFP_KERNEL); + if (!ppath) { + kfree(target_share); return -ENOMEM; } - if (plen1) - scnprintf(*prefix, len, "%.*s%c%.*s", (int)plen1, s, sep, (int)plen2, p); - else - strscpy(*prefix, p, len); + c = strscpy(ppath, target_ppath, len); + if (c && dfsref_pplen) + ppath[c] = sep; + strlcat(ppath, dfsref_ppath, len); } + *share = target_share; + *prefix = ppath; return 0; } @@ -1327,9 +1341,9 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n", __func__, ip); } else { - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); } kfree(ip); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c6214cfc575f..3b7915af1f62 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1120,14 +1120,14 @@ sess_establish_session(struct sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; struct TCP_Server_Info *server = sess_data->server; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (!server->session_estab) { if (server->sign) { server->session_key.response = kmemdup(ses->auth_key.response, ses->auth_key.len, GFP_KERNEL); if (!server->session_key.response) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return -ENOMEM; } server->session_key.len = @@ -1136,7 +1136,7 @@ sess_establish_session(struct sess_data *sess_data) server->sequence_number = 0x2; server->session_estab = true; } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, 
"CIFS session established successfully\n"); return 0; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index c71c9a44bef4..2e20ee4dab7b 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -38,10 +38,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, in_buf->WordCount = 0; put_bcc(0, in_buf); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } @@ -55,7 +55,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst, if (rc < 0) server->sequence_number--; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n", get_mid(in_buf), rc); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d7ade739cde1..98a76fa791c0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3859,7 +3859,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (rc) goto out; - if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) + if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) smb2_set_sparse(xid, tcon, cfile, inode, false); eof = cpu_to_le64(off + len); @@ -4345,11 +4345,13 @@ smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, } } +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY static bool smb2_is_read_op(__u32 oplock) { return oplock == SMB2_OPLOCK_LEVEL_II; } +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ static bool smb21_is_read_op(__u32 oplock) @@ -5448,7 +5450,7 @@ out: return rc; } - +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -5547,6 +5549,7 @@ struct smb_version_operations smb20_operations = { .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, }; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ struct 
smb_version_operations smb21_operations = { .compare_fids = smb2_compare_fids, @@ -5878,6 +5881,7 @@ struct smb_version_operations smb311_operations = { .is_network_name_deleted = smb2_is_network_name_deleted, }; +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -5898,6 +5902,7 @@ struct smb_version_values smb20_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease), }; +#endif /* ALLOW_INSECURE_LEGACY */ struct smb_version_values smb21_values = { .version_string = SMB21_VERSION_STRING, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 084be3a90198..0e8c85249579 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1369,13 +1369,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; struct TCP_Server_Info *server = sess_data->server; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); if (server->ops->generate_signingkey) { rc = server->ops->generate_signingkey(ses, server); if (rc) { cifs_dbg(FYI, "SMB3 session key generation failed\n"); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } } @@ -1383,7 +1383,7 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) server->sequence_number = 0x2; server->session_estab = true; } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); return rc; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index c3278db1cade..5fbbec22bcc8 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1382,9 +1382,9 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "freeing mr list\n"); wake_up_interruptible_all(&info->wait_mr); while (atomic_read(&info->mr_used_count)) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); msleep(1000); - 
mutex_lock(&server->srv_mutex); + cifs_server_lock(server); } destroy_mr_list(info); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 05eca41e3b1e..bfc9bd55870a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -822,7 +822,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, } else instance = exist_credits->instance; - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * We can't use credits obtained from the previous session to send this @@ -830,14 +830,14 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, * return -EAGAIN in such cases to let callers handle it. */ if (instance != server->reconnect_instance) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); add_credits_and_wake_if(server, &credits, optype); return -EAGAIN; } mid = server->ops->setup_async_request(server, rqst); if (IS_ERR(mid)) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); add_credits_and_wake_if(server, &credits, optype); return PTR_ERR(mid); } @@ -868,7 +868,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_delete_mid(mid); } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc == 0) return 0; @@ -1109,7 +1109,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * of smb data. */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); /* * All the parts of the compound chain belong obtained credits from the @@ -1119,7 +1119,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, * handle it. 
*/ if (instance != server->reconnect_instance) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); for (j = 0; j < num_rqst; j++) add_credits(server, &credits[j], optype); return -EAGAIN; @@ -1131,7 +1131,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, revert_current_mid(server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* Update # of requests on wire to server */ for (j = 0; j < num_rqst; j++) @@ -1163,7 +1163,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, server->sequence_number -= 2; } - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* * If sending failed for some reason or it is an oplock break that we @@ -1190,9 +1190,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if ((ses->ses_status == SES_NEW) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { spin_unlock(&cifs_tcp_ses_lock); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smb311_update_preauth_hash(ses, server, rqst[0].rq_iov, rqst[0].rq_nvec); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); spin_lock(&cifs_tcp_ses_lock); } @@ -1266,9 +1266,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, .iov_len = resp_iov[0].iov_len }; spin_unlock(&cifs_tcp_ses_lock); - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); smb311_update_preauth_hash(ses, server, &iov, 1); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); spin_lock(&cifs_tcp_ses_lock); } spin_unlock(&cifs_tcp_ses_lock); @@ -1385,11 +1385,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); /* Update # of requests on wire to server */ 
add_credits(server, &credits, 0); return rc; @@ -1397,7 +1397,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); goto out; } @@ -1411,7 +1411,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) server->sequence_number -= 2; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc < 0) goto out; @@ -1530,18 +1530,18 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, and avoid races inside tcp sendmsg code that could cause corruption of smb data */ - mutex_lock(&server->srv_mutex); + cifs_server_lock(server); rc = allocate_mid(ses, in_buf, &midQ); if (rc) { - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number); if (rc) { cifs_delete_mid(midQ); - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); return rc; } @@ -1554,7 +1554,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc < 0) server->sequence_number -= 2; - mutex_unlock(&server->srv_mutex); + cifs_server_unlock(server); if (rc < 0) { cifs_delete_mid(midQ); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index a5cc4ed2cd0d..8e01d89c3319 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -17,6 +17,7 @@ static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space rreq->start = start; rreq->len = len; rreq->mapping = mapping; + rreq->inode = mapping->host; INIT_LIST_HEAD(&rreq->subrequests); refcount_set(&rreq->ref, 1); return rreq; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index bcc8335b46b3..95a403720e8c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -288,7 +288,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - err = z_erofs_fill_inode(inode); + if 
(!erofs_is_fscache_mode(inode->i_sb)) + err = z_erofs_fill_inode(inode); + else + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 95efc127b2ba..724bb57075f6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -199,7 +199,6 @@ struct z_erofs_decompress_frontend { struct z_erofs_pagevec_ctor vector; struct z_erofs_pcluster *pcl, *tailpcl; - struct z_erofs_collection *cl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; @@ -214,7 +213,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } + .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); @@ -357,7 +356,7 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, return false; } -/* callers must be with collection lock held */ +/* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct page *page, enum z_erofs_page_type type, bool pvec_safereuse) @@ -372,7 +371,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, pvec_safereuse); - fe->cl->vcnt += (unsigned int)ret; + fe->pcl->vcnt += (unsigned int)ret; return ret ? 
0 : -EAGAIN; } @@ -405,12 +404,11 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl = fe->pcl; - struct z_erofs_collection *cl; unsigned int length; /* to avoid unexpected loop formed by corrupted images */ @@ -419,8 +417,7 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, return -EFSCORRUPTED; } - cl = z_erofs_primarycollection(pcl); - if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -443,23 +440,21 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, length = READ_ONCE(pcl->length); } } - mutex_lock(&cl->lock); + mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) fe->tailpcl = pcl; z_erofs_try_to_claim_pcluster(fe); - fe->cl = cl; return 0; } -static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; struct erofs_workgroup *grp; int err; @@ -482,17 +477,15 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = COLLECT_PRIMARY_FOLLOWED; - cl = z_erofs_primarycollection(pcl); - cl->pageofs = 
map->m_la & ~PAGE_MASK; - /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ - mutex_init(&cl->lock); - DBG_BUGON(!mutex_trylock(&cl->lock)); + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ @@ -519,11 +512,10 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; - fe->cl = cl; return 0; err_out: - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } @@ -535,9 +527,9 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, struct erofs_workgroup *grp; int ret; - DBG_BUGON(fe->cl); + DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -554,14 +546,14 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_collection(fe, inode, map); + ret = z_erofs_register_pcluster(fe, inode, map); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_collection(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe, inode, map); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -569,7 +561,7 @@ tailpacking: out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->cl->pagevec, fe->cl->vcnt); + fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -582,48 +574,36 @@ out: */ static void z_erofs_rcu_callback(struct rcu_head *head) { - struct 
z_erofs_collection *const cl = - container_of(head, struct z_erofs_collection, rcu); - - z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); - - call_rcu(&cl->rcu, z_erofs_rcu_callback); -} -static void z_erofs_collection_put(struct z_erofs_collection *cl) -{ - struct z_erofs_pcluster *const pcl = - container_of(cl, struct z_erofs_pcluster, primary_collection); - - erofs_workgroup_put(&pcl->obj); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = fe->cl; + struct z_erofs_pcluster *pcl = fe->pcl; - if (!cl) + if (!pcl) return false; z_erofs_pagevec_ctor_exit(&fe->vector, false); - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
*/ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) - z_erofs_collection_put(cl); + erofs_workgroup_put(&pcl->obj); - fe->cl = NULL; + fe->pcl = NULL; return true; } @@ -663,28 +643,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, repeat: cur = end - 1; - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid collection previously (very rare) */ - if (!fe->cl) - goto restart_now; - goto hitted; - } - - /* go ahead the next map_blocks */ - erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; + if (offset + cur < map->m_la || + offset + cur >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + cur); - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto err_out; + if (z_erofs_collector_end(fe)) + fe->backmost = false; + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + } else { + if (fe->pcl) + goto hitted; + /* didn't get a valid pcluster previously (very rare) */ + } -restart_now: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; @@ -766,7 +741,7 @@ retry: /* bump up the number of spiltted parts of a page */ ++spiltted; /* also update nr_pages */ - fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1); + fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ map->m_llen = offset + cur - map->m_la; @@ -821,15 +796,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, enum z_erofs_page_type page_type; bool overlapped, partial; - struct z_erofs_collection *cl; int err; might_sleep(); - cl = z_erofs_primarycollection(pcl); - DBG_BUGON(!READ_ONCE(cl->nr_pages)); + DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&cl->lock); - 
nr_pages = cl->nr_pages; + mutex_lock(&pcl->lock); + nr_pages = pcl->nr_pages; if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { pages = pages_onstack; @@ -857,9 +830,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, err = 0; z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, 0); + pcl->pagevec, 0); - for (i = 0; i < cl->vcnt; ++i) { + for (i = 0; i < pcl->vcnt; ++i) { unsigned int pagenr; page = z_erofs_pagevec_dequeue(&ctor, &page_type); @@ -945,11 +918,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, goto out; llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { outputsize = llen; partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); } else { - outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; partial = true; } @@ -963,7 +936,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .in = compressed_pages, .out = pages, .pageofs_in = pcl->pageofs_in, - .pageofs_out = cl->pageofs, + .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, @@ -1012,16 +985,12 @@ out: else if (pages != pages_onstack) kvfree(pages); - cl->nr_pages = 0; - cl->vcnt = 0; + pcl->nr_pages = 0; + pcl->vcnt = 0; - /* all cl locks MUST be taken before the following line */ + /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); - - /* all cl locks SHOULD be released right now */ - mutex_unlock(&cl->lock); - - z_erofs_collection_put(cl); + mutex_unlock(&pcl->lock); return err; } @@ -1043,6 +1012,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(pcl->next); z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + erofs_workgroup_put(&pcl->obj); } } @@ -1466,22 +1436,19 @@ static void 
z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct page *page; page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (!page) - goto skip; - - if (PageUptodate(page)) { - unlock_page(page); + if (page) { + if (PageUptodate(page)) { + unlock_page(page); + } else { + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + } put_page(page); - goto skip; } - err = z_erofs_do_read_page(f, page, pagepool); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - put_page(page); -skip: if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 800b11c53f57..58053bb5066f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,21 +12,40 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * - * L: Field should be protected by pageset lock; + * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. 
*/ -struct z_erofs_collection { +struct z_erofs_pcluster { + struct erofs_workgroup obj; struct mutex lock; + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + /* I: page offset of start position of decompression */ - unsigned short pageofs; + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; /* L: maximum relative page index in pagevec[] */ unsigned short nr_pages; @@ -41,29 +60,6 @@ struct z_erofs_collection { /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -}; - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct z_erofs_collection primary_collection; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* A: lower limit of decompressed length and if full length or not */ - unsigned int length; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; union { /* I: physical cluster size in pages */ @@ -80,8 +76,6 @@ struct z_erofs_pcluster { struct page *compressed_pages[]; }; -#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) - /* let's avoid the valid 32-bit kernel addresses */ /* the chained workgroup has't submitted io (still open) */ diff --git a/fs/exec.c b/fs/exec.c index 14b4b3755580..0989fb8472a1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1312,9 +1312,7 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - if (me->flags & PF_KTHREAD) - free_kthread_struct(me); - me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | 
PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; @@ -1959,6 +1957,10 @@ int kernel_execve(const char *kernel_filename, int fd = AT_FDCWD; int retval; + /* It is non-sense for kernel threads to call execve */ + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return -EINVAL; + filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0106eba46d5a..3ef80d000e13 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -145,7 +145,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf)); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -525,7 +525,8 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); + nresult = lookup_one(mnt_user_ns(mnt), nbuf, + target_dir, strlen(nbuf)); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 456c1e89386a..6d8b2bf14de0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -158,7 +152,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); 
set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +190,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); @@ -1010,9 +1004,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8f38c26bb16c..7fcbcf979737 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -585,6 +584,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 
1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { @@ -2564,7 +2591,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2601,6 +2633,7 @@ got_it: err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
@@ -2737,11 +2770,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { @@ -3314,6 +3342,100 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + 
struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { @@ -3321,7 +3443,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3332,14 +3454,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -3387,7 +3501,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3443,8 +3561,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3488,8 +3604,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3522,9 +3642,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(&folio->page); - if (page_private_atomic(&folio->page)) - return f2fs_drop_inmem_page(inode, &folio->page); - folio_detach_private(folio); } @@ -3536,10 +3653,6 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - /* This is atomic written page, keep Private */ - if (page_private_atomic(&folio->page)) - return false; - sbi = F2FS_M_SB(folio->mapping); if (test_opt(sbi, COMPRESS_CACHE)) { struct inode *inode = folio->mapping->host; @@ -3565,18 +3678,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(&folio->page)) { - f2fs_register_inmem_page(inode, &folio->page); - return true; - } - /* - * Previously, this page has been 
registered, we just - * return here. - */ - return false; - } - if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); f2fs_update_dirty_folio(inode, folio); @@ -3656,42 +3757,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, 
F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -167,8 +164,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +291,6 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +485,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -623,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a0e51937d92e..d5bd7932fb64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10d1f138d14f..d9bbecd008d2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -509,11 +509,11 @@ struct f2fs_filename { #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. 
*/ struct fscrypt_str cf_name; #endif @@ -579,8 +579,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -717,7 +717,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -739,8 +738,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -753,7 +750,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -795,11 +791,9 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and 
gc */ struct f2fs_rwsem i_gc_rwsem[2]; @@ -1093,7 +1087,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1118,16 +1111,12 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. */ OPU, }; @@ -1277,6 +1266,15 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1615,8 +1613,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -1719,7 +1717,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1750,9 +1747,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of 
inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1763,7 +1758,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */ struct completion s_kobj_unregister; @@ -2606,11 +2601,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; @@ -3173,6 +3174,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); @@ -3203,16 +3208,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3445,6 +3440,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3580,11 +3577,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3726,6 +3720,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct 
f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3787,8 +3782,7 @@ extern const struct iomap_ops f2fs_iomap_ops; int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); @@ -3816,7 +3810,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3834,7 +3827,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3846,7 +3839,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -3946,17 +3938,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - 
(atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -4018,9 +3999,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4053,6 +4031,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, @@ -4422,8 +4401,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } @@ -4431,8 +4409,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int 
diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) @@ -4540,6 +4518,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 100637b1adb3..bd14cef1b08f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); @@ -1437,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, 
index - start); @@ -1638,6 +1647,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1675,8 +1689,8 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) goto out_err; } @@ -1766,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; @@ -1804,16 +1822,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1828,8 +1838,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1992,6 +2002,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2014,44 +2025,55 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. 
*/ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2076,127 +2098,20 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - 
f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = 
mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - +unlock_out: inode_unlock(inode); - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -2437,6 +2352,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2462,7 +2381,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2471,6 +2392,12 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2498,8 +2425,8 @@ do_more: f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2674,6 +2601,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; @@ -2913,6 +2841,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2956,7 +2889,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -3017,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3037,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3987,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info 
*fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || @@ -4010,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4082,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4136,11 +4061,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: @@ -4328,17 +4251,39 @@ out: static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - if (f2fs_should_use_dio(inode, iocb, to)) - return f2fs_dio_read_iter(iocb, to); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } - ret = filemap_read(iocb, to, 0); - if (ret > 0) - 
f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4630,14 +4575,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from, dio); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync): f2fs_buffered_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. 
*/ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea5b93b689cd..d5fb426e0747 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,10 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +145,12 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -646,6 +654,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if 
(f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -787,6 +843,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1194,18 +1253,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - - if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1344,18 +1394,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1475,11 +1516,19 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if 
(phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); @@ -1699,23 +1748,21 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1726,7 +1773,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1743,8 +1789,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1754,54 +1799,69 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) - skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? 
sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; } + } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) - ret = f2fs_write_checkpoint(sbi, &cpc); + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1816,7 +1876,7 @@ stop: put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 3cb1e7a24740..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. 
+ * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a578bf83b803..bf46a7dfbea2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -14,21 +14,40 @@ #include "node.h" #include <trace/events/f2fs.h> -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. 
+ */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83639238a1fe..fc55f5bd1fcc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); @@ -466,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -745,9 +744,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -796,8 +794,22 @@ retry: f2fs_lock_op(sbi); err = 
f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ed79b29999f..c549acb52ac4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; @@ -461,6 +458,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -836,8 +840,8 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, - struct inode **whiteout) + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -851,7 +855,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -876,21 +880,25 @@ static int 
__f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -911,7 +919,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -921,7 +929,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns, return -EIO; return __f2fs_tmpfile(mnt_userns, dir, NULL, - S_IFCHR | WHITEOUT_MODE, whiteout); + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ccff18560ff..836c79a20afc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); 
} else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; @@ -1416,8 +1412,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1438,21 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1631,7 +1628,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ 
COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7225ce09f3ab..874c1b9c41a2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -30,7 +30,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -185,301 +185,175 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); - trace_f2fs_register_inmem_page(page, INMEM); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } } -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +static int __replace_atomic_write_block(struct inode *inode, pgoff_t 
index, + block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; - - list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. - */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; + struct dnode_of_data dn; + struct node_info ni; + int err; - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - - list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + return err; } - return err; -} -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct 
list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); + } else { + blkcnt_t count = 1; - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, 
count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); + f2fs_put_dnode(&dn); + return 0; } -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) { - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - - f2fs_bug_on(sbi, !page_private_atomic(page)); + struct revoke_entry *cur, *tmp; - mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) - break; + list_for_each_entry_safe(cur, tmp, head, list) { + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); + list_del(&cur->list); + kmem_cache_free(revoke_entry_slab, cur); } - - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __f2fs_commit_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head 
revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; + } - f2fs_wait_on_page_writeback(page, DATA, true, true); + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - unlock_page(page); - break; + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; - } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); - } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); - if (err) { - /* - * try to revoke all committed pages, but still we could fail 
- * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - return err; +out: + __complete_revoke_list(inode, &revoke_list, ret ? true : false); + + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -520,8 +394,15 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + 
.should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } @@ -1664,33 +1545,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } @@ -3286,8 +3166,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -4084,10 +3963,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if 
(ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) @@ -4455,7 +4336,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, @@ -4480,8 +4361,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4521,15 +4402,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4551,13 +4432,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - return err; + if 
(sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -4637,6 +4529,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -5225,6 +5124,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } @@ -5335,9 +5235,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5356,5 +5256,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c94caf0c0a1..3f277dfcb131 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,7 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? 
NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) @@ -224,10 +225,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { @@ -294,6 +295,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ @@ -572,11 +576,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; @@ -602,19 +605,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs 
= total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ed3e8b7a8260..37221e94e5ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -525,10 +525,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); + return 0; #else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; #endif - return 0; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -1339,9 +1340,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1382,9 +1380,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); 
spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1707,18 +1704,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1731,14 +1733,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } @@ -2055,7 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2066,14 +2066,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } 
sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; @@ -2094,6 +2105,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); @@ -2684,7 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2703,7 +2716,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; @@ -3648,22 +3662,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; - sbi->blocks_per_blkz = 
SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, @@ -4070,30 +4091,9 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 65395ae188aa..7b8f2b41c29b 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -129,7 +129,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* diff --git a/fs/file.c b/fs/file.c index ee9317346702..3bcc1ecc314a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -630,32 +630,23 @@ EXPORT_SYMBOL(fd_install); * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * - 
* If this functions returns an EINVAL error pointer the fd was beyond the - * current maximum number of file descriptors for that fdtable. + * Context: files_lock must be held. * - * Returns: The file associated with @fd, on error returns an error pointer. + * Returns: The file associated with @fd (NULL if @fd is not open) */ static struct file *pick_file(struct files_struct *files, unsigned fd) { + struct fdtable *fdt = files_fdtable(files); struct file *file; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) { - file = ERR_PTR(-EINVAL); - goto out_unlock; - } + if (fd >= fdt->max_fds) + return NULL; + file = fdt->fd[fd]; - if (!file) { - file = ERR_PTR(-EBADF); - goto out_unlock; + if (file) { + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); } - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - -out_unlock: - spin_unlock(&files->file_lock); return file; } @@ -664,8 +655,10 @@ int close_fd(unsigned fd) struct files_struct *files = current->files; struct file *file; + spin_lock(&files->file_lock); file = pick_file(files, fd); - if (IS_ERR(file)) + spin_unlock(&files->file_lock); + if (!file) return -EBADF; return filp_close(file, files); @@ -702,20 +695,25 @@ static inline void __range_cloexec(struct files_struct *cur_fds, static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { + unsigned n; + + rcu_read_lock(); + n = last_fd(files_fdtable(cur_fds)); + rcu_read_unlock(); + max_fd = min(max_fd, n); + while (fd <= max_fd) { struct file *file; + spin_lock(&cur_fds->file_lock); file = pick_file(cur_fds, fd++); - if (!IS_ERR(file)) { + spin_unlock(&cur_fds->file_lock); + + if (file) { /* found a valid file to close */ filp_close(file, cur_fds); cond_resched(); - continue; } - - /* beyond the last fd in that table */ - if (PTR_ERR(file) == -EINVAL) - return; } } @@ -795,43 +793,25 @@ int __close_range(unsigned fd, unsigned 
max_fd, unsigned int flags) * See close_fd_get_file() below, this variant assumes current->files->file_lock * is held. */ -int __close_fd_get_file(unsigned int fd, struct file **res) +struct file *__close_fd_get_file(unsigned int fd) { - struct files_struct *files = current->files; - struct file *file; - struct fdtable *fdt; - - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_err; - file = fdt->fd[fd]; - if (!file) - goto out_err; - rcu_assign_pointer(fdt->fd[fd], NULL); - __put_unused_fd(files, fd); - get_file(file); - *res = file; - return 0; -out_err: - *res = NULL; - return -ENOENT; + return pick_file(current->files, fd); } /* * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file, and then - * an fput(). + * The caller must ensure that filp_close() called on the file. */ -int close_fd_get_file(unsigned int fd, struct file **res) +struct file *close_fd_get_file(unsigned int fd) { struct files_struct *files = current->files; - int ret; + struct file *file; spin_lock(&files->file_lock); - ret = __close_fd_get_file(fd, res); + file = pick_file(files, fd); spin_unlock(&files->file_lock); - return ret; + return file; } void do_close_on_exec(struct files_struct *files) @@ -871,7 +851,7 @@ void do_close_on_exec(struct files_struct *files) } static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask, unsigned int refs) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -897,10 +877,9 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * Such a race can take two forms: * * (a) the file ref already went down to zero, - * and get_file_rcu_many() fails. Just try - * again: + * and get_file_rcu() fails. 
Just try again: */ - if (unlikely(!get_file_rcu_many(file, refs))) + if (unlikely(!get_file_rcu(file))) continue; /* @@ -909,11 +888,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * - * If so, we need to put our refs and try again. + * If so, we need to put our ref and try again. */ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { - fput_many(file, refs); + fput(file); continue; } @@ -926,37 +905,31 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, } static struct file *__fget_files(struct files_struct *files, unsigned int fd, - fmode_t mask, unsigned int refs) + fmode_t mask) { struct file *file; rcu_read_lock(); - file = __fget_files_rcu(files, fd, mask, refs); + file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } -static inline struct file *__fget(unsigned int fd, fmode_t mask, - unsigned int refs) -{ - return __fget_files(current->files, fd, mask, refs); -} - -struct file *fget_many(unsigned int fd, unsigned int refs) +static inline struct file *__fget(unsigned int fd, fmode_t mask) { - return __fget(fd, FMODE_PATH, refs); + return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { - return __fget(fd, FMODE_PATH, 1); + return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - return __fget(fd, 0, 1); + return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); @@ -966,7 +939,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) task_lock(task); if (task->files) - file = __fget_files(task->files, fd, 0, 1); + file = __fget_files(task->files, fd, 0); task_unlock(task); return file; @@ -1035,7 +1008,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask, 1); + file = __fget(fd, mask); if (!file) 
return 0; return FDPUT_FPUT | (unsigned long)file; diff --git a/fs/file_table.c b/fs/file_table.c index ada8fe814db9..5424e3a8df5f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -368,9 +368,9 @@ EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); -void fput_many(struct file *file, unsigned int refs) +void fput(struct file *file) { - if (atomic_long_sub_and_test(refs, &file->f_count)) { + if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { @@ -389,11 +389,6 @@ void fput_many(struct file *file, unsigned int refs) } } -void fput(struct file *file) -{ - fput_many(file, 1); -} - /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index a41ea0ba6943..bffd156d6434 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_SUPER_H_ #define _VXFS_SUPER_H_ diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index 1fd41cf98b9f..de2a5bccb930 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index acc5477b3f23..fbcd603365ad 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_DIR_H_ #define _VXFS_DIR_H_ diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index f5c428e21024..3a2180c5e208 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_EXTERN_H_ #define _VXFS_EXTERN_H_ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index a4610a77649e..c1174a3f8990 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h index e026f0c49159..dfd2147599c4 100644 --- a/fs/freevxfs/vxfs_fshead.h +++ b/fs/freevxfs/vxfs_fshead.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_FSHEAD_H_ #define _VXFS_FSHEAD_H_ diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index a37431e443d3..c2ef9f0debbd 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 1f41b25ef38b..ceb6a12649ba 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h index f012abed125d..1e9e138d2b33 100644 --- a/fs/freevxfs/vxfs_inode.h +++ b/fs/freevxfs/vxfs_inode.h @@ -1,32 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_INODE_H_ #define _VXFS_INODE_H_ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index a51425634f65..f04ba2ed1e1a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 813da6685151..23f35187c289 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h index 0c0b0c9fa557..53afba08d617 100644 --- a/fs/freevxfs/vxfs_olt.h +++ b/fs/freevxfs/vxfs_olt.h @@ -1,31 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * */ #ifndef _VXFS_OLT_H_ #define _VXFS_OLT_H_ diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 6143ebab940d..0e633d2bfc7d 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -1,30 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 22eed5a73ac2..c3b82f716f9a 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -1,31 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL"). - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. */ /* diff --git a/fs/fsopen.c b/fs/fsopen.c index 27a890aa493a..fc9d2d9fd234 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -119,7 +119,7 @@ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) const char *fs_name; int ret; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; if (flags & ~FSOPEN_CLOEXEC) @@ -162,7 +162,7 @@ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags unsigned int lookup_flags; int ret; - if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; if ((flags & ~(FSPICK_CLOEXEC | diff --git a/fs/internal.h b/fs/internal.h index 9a6c233ee7f1..87e96b9024ce 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -84,6 +84,7 @@ extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); extern void dissolve_on_fput(struct vfsmount *); +extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); @@ -125,7 +126,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern int __close_fd_get_file(unsigned int fd, struct file **res); +extern struct file 
*__close_fd_get_file(unsigned int fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); diff --git a/fs/io_uring.c b/fs/io_uring.c index 9f1c682d7caf..3aab4182fd89 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -112,7 +112,8 @@ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_CREDS | REQ_F_ASYNC_DATA) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) @@ -540,6 +541,7 @@ struct io_uring_task { const struct io_ring_ctx *last; struct io_wq *io_wq; struct percpu_counter inflight; + atomic_t inflight_tracked; atomic_t in_idle; spinlock_t task_lock; @@ -574,6 +576,7 @@ struct io_close { struct file *file; int fd; u32 file_slot; + u32 flags; }; struct io_timeout_data { @@ -1355,8 +1358,6 @@ static void io_clean_op(struct io_kiocb *req); static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); static struct file *io_file_get_normal(struct io_kiocb *req, int fd); -static void io_drop_inflight_file(struct io_kiocb *req); -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags); static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1366,7 +1367,9 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct 
io_ring_ctx *ctx); @@ -1757,9 +1760,29 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) __must_hold(&req->ctx->timeout_lock) { + struct io_kiocb *req; + if (task && head->task != task) return false; - return cancel_all; + if (cancel_all) + return true; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; } /* @@ -1769,9 +1792,24 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { + bool matched; + if (task && head->task != task) return false; - return cancel_all; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; } static inline bool req_has_async_data(struct io_kiocb *req) @@ -1927,6 +1965,14 @@ static inline bool io_req_ffs_set(struct io_kiocb *req) return req->flags & REQ_F_FIXED_FILE; } +static inline void io_req_track_inflight(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_INFLIGHT)) { + req->flags |= REQ_F_INFLIGHT; + atomic_inc(&current->io_uring->inflight_tracked); + } +} + static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -2988,8 +3034,6 @@ static void __io_req_task_work_add(struct io_kiocb *req, unsigned long flags; bool running; - io_drop_inflight_file(req); - spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, list); running = tctx->task_running; @@ -4176,6 +4220,16 @@
static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } +static int io_readv_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, READ); +} + +static int io_writev_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, WRITE); +} + /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -5103,42 +5157,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_NET) - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - req->shutdown.how = READ_ONCE(sqe->len); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_NET) - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, req->shutdown.how); - io_req_complete(req, ret); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5445,15 +5463,11 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) unsigned long nr = ctx->nr_user_files; int ret; - if (table->alloc_hint >= nr) - table->alloc_hint = 0; - do { ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) { - table->alloc_hint = ret + 1; + if (ret != nr) return ret; - } + if (!table->alloc_hint) break; @@ -5464,6 +5478,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. 
+ */ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot) { @@ -5471,26 +5489,24 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct io_ring_ctx *ctx = req->ctx; int ret; + io_ring_submit_lock(ctx, issue_flags); + if (alloc_slot) { - io_ring_submit_lock(ctx, issue_flags); ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) { - io_ring_submit_unlock(ctx, issue_flags); - return ret; - } - + if (unlikely(ret < 0)) + goto err; file_slot = ret; } else { file_slot--; } ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (alloc_slot) { - io_ring_submit_unlock(ctx, issue_flags); - if (!ret) - return file_slot; - } - + if (!ret && alloc_slot) + ret = file_slot; +err: + io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) + fput(file); return ret; } @@ -5972,14 +5988,18 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) + req->close.flags = READ_ONCE(sqe->close_flags); + if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT) + return -EINVAL; + if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) && + req->close.file_slot && req->close.fd) return -EINVAL; return 0; @@ -5990,12 +6010,13 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct files_struct *files = current->files; struct io_close *close = &req->close; struct fdtable *fdt; - struct file *file = NULL; + struct file *file; int ret = -EBADF; if (req->close.file_slot) { ret = io_close_fixed(req, issue_flags); - goto err; + if 
(ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT)) + goto err; } spin_lock(&files->file_lock); @@ -6008,7 +6029,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) lockdep_is_held(&files->file_lock)); if (!file || file->f_op == &io_uring_fops) { spin_unlock(&files->file_lock); - file = NULL; goto err; } @@ -6018,21 +6038,16 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - ret = __close_fd_get_file(close->fd, &file); + file = __close_fd_get_file(close->fd); spin_unlock(&files->file_lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = -EBADF; + if (!file) goto err; - } /* No ->flush() or already async, safely close from here */ ret = filp_close(file, current->files); err: if (ret < 0) req_set_fail(req); - if (file) - fput(file); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -6063,6 +6078,34 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) } #if defined(CONFIG_NET) +static int io_shutdown_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + req->shutdown.how = READ_ONCE(sqe->len); + return 0; +} + +static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) +{ + struct socket *sock; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, req->shutdown.how); + io_req_complete(req, ret); + return 0; +} + static bool io_net_retry(struct socket *sock, int flags) { if (!(flags & MSG_WAITALL)) @@ -6674,8 +6717,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) fd_install(fd, file); ret = fd; } else { - ret = io_install_fixed_file(req, file, issue_flags, - sock->file_slot - 1); + ret = io_fixed_fd_install(req, issue_flags, file, + sock->file_slot); } 
__io_req_complete(req, issue_flags, ret, 0); return 0; @@ -6767,6 +6810,7 @@ IO_NETOP_PREP_ASYNC(recvmsg); IO_NETOP_PREP_ASYNC(connect); IO_NETOP_PREP(accept); IO_NETOP_PREP(socket); +IO_NETOP_PREP(shutdown); IO_NETOP_FN(send); IO_NETOP_FN(recv); #endif /* CONFIG_NET */ @@ -6905,10 +6949,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; - unsigned flags = locked ? 0 : IO_URING_F_UNLOCKED; - - if (unlikely(!io_assign_file(req, flags))) - return -EBADF; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; } @@ -7390,7 +7430,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); } -static int io_poll_update_prep(struct io_kiocb *req, +static int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_update *upd = &req->poll_update; @@ -7454,7 +7494,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) +static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; struct io_ring_ctx *ctx = req->ctx; @@ -7698,8 +7738,9 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool is_timeout_link) +static int __io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool is_timeout_link) { struct io_timeout_data *data; unsigned flags; @@ -7754,6 +7795,18 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static int io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, false); +} + +static int 
io_link_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, true); +} + static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -7970,7 +8023,7 @@ done: return 0; } -static int io_rsrc_update_prep(struct io_kiocb *req, +static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) @@ -7986,6 +8039,41 @@ static int io_rsrc_update_prep(struct io_kiocb *req, return 0; } +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + unsigned int done; + struct file *file; + int ret, fd; + + for (done = 0; done < req->rsrc_update.nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + ret = -EFAULT; + __io_close_fixed(req, issue_flags, ret); + break; + } + } + + if (done) + return done; + return ret; +} + static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -7999,10 +8087,14 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) up.resv = 0; up.resv2 = 0; - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); - io_ring_submit_unlock(ctx, issue_flags); + if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up, req->rsrc_update.nr_args); + io_ring_submit_unlock(ctx, 
issue_flags); + } if (ret < 0) req_set_fail(req); @@ -8025,7 +8117,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE: - return io_poll_update_prep(req, sqe); + return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: @@ -8039,13 +8131,13 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_CONNECT: return io_connect_prep(req, sqe); case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe, false); + return io_timeout_prep(req, sqe); case IORING_OP_TIMEOUT_REMOVE: return io_timeout_remove_prep(req, sqe); case IORING_OP_ASYNC_CANCEL: return io_async_cancel_prep(req, sqe); case IORING_OP_LINK_TIMEOUT: - return io_timeout_prep(req, sqe, true); + return io_link_timeout_prep(req, sqe); case IORING_OP_ACCEPT: return io_accept_prep(req, sqe); case IORING_OP_FALLOCATE: @@ -8055,7 +8147,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_CLOSE: return io_close_prep(req, sqe); case IORING_OP_FILES_UPDATE: - return io_rsrc_update_prep(req, sqe); + return io_files_update_prep(req, sqe); case IORING_OP_STATX: return io_statx_prep(req, sqe); case IORING_OP_FADVISE: @@ -8123,9 +8215,9 @@ static int io_req_prep_async(struct io_kiocb *req) switch (req->opcode) { case IORING_OP_READV: - return io_rw_prep_async(req, READ); + return io_readv_prep_async(req); case IORING_OP_WRITEV: - return io_rw_prep_async(req, WRITE); + return io_writev_prep_async(req); case IORING_OP_SENDMSG: return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: @@ -8264,6 +8356,11 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } if (req->flags & REQ_F_CREDS) 
put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -8288,6 +8385,7 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { + const struct io_op_def *def = &io_op_defs[req->opcode]; const struct cred *creds = NULL; int ret; @@ -8297,7 +8395,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); - if (!io_op_defs[req->opcode].audit_skip) + if (!def->audit_skip) audit_uring_entry(req->opcode); switch (req->opcode) { @@ -8321,7 +8419,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = io_poll_add(req, issue_flags); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_update(req, issue_flags); + ret = io_poll_remove(req, issue_flags); break; case IORING_OP_SYNC_FILE_RANGE: ret = io_sync_file_range(req, issue_flags); @@ -8436,7 +8534,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) break; } - if (!io_op_defs[req->opcode].audit_skip) + if (!def->audit_skip) audit_uring_exit(!ret, ret); if (creds) @@ -8569,19 +8667,6 @@ out: return file; } -/* - * Drop the file for requeue operations. Only used of req->file is the - * io_uring descriptor itself. 
- */ -static void io_drop_inflight_file(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_INFLIGHT)) { - fput(req->file); - req->file = NULL; - req->flags &= ~REQ_F_INFLIGHT; - } -} - static struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); @@ -8590,7 +8675,7 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) /* we don't allow fixed io_uring files */ if (file && file->f_op == &io_uring_fops) - req->flags |= REQ_F_INFLIGHT; + io_req_track_inflight(req); return file; } @@ -8788,6 +8873,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { + const struct io_op_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -8805,12 +8891,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } + def = &io_op_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; if (sqe_flags & IOSQE_BUFFER_SELECT) { - if (!io_op_defs[opcode].buffer_select) + if (!def->buffer_select) return -EOPNOTSUPP; req->buf_index = READ_ONCE(sqe->buf_group); } @@ -8836,12 +8923,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } } - if (!io_op_defs[opcode].ioprio && sqe->ioprio) + if (!def->ioprio && sqe->ioprio) return -EINVAL; - if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) + if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (io_op_defs[opcode].needs_file) { + if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; req->cqe.fd = READ_ONCE(sqe->fd); @@ -8850,7 +8937,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. 
*/ - if (state->need_plug && io_op_defs[opcode].plug) { + if (state->need_plug && def->plug) { state->plug_started = true; state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); @@ -9658,8 +9745,7 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); - if (bit == table->alloc_hint) - table->alloc_hint++; + table->alloc_hint = bit + 1; } static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -10113,21 +10199,19 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; bool needs_switch = false; struct io_fixed_file *file_slot; - int ret = -EBADF; + int ret; - io_ring_submit_lock(ctx, issue_flags); if (file->f_op == &io_uring_fops) - goto err; - ret = -ENXIO; + return -EBADF; if (!ctx->file_data) - goto err; - ret = -EINVAL; + return -ENXIO; if (slot_index >= ctx->nr_user_files) - goto err; + return -EINVAL; slot_index = array_index_nospec(slot_index, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); @@ -10158,15 +10242,14 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, err: if (needs_switch) io_rsrc_node_switch(ctx, ctx->file_data); - io_ring_submit_unlock(ctx, issue_flags); if (ret) fput(file); return ret; } -static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) { - unsigned int offset = req->close.file_slot - 1; struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *file_slot; struct file *file; @@ -10203,6 +10286,11 @@ out: return ret; } +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + 
return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); +} + static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) @@ -10351,6 +10439,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -11046,6 +11135,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); __io_remove_buffers(ctx, bl, -1U); + kfree(bl); } while (!list_empty(&ctx->io_buffers_pages)) { @@ -11581,7 +11671,7 @@ static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) - return 0; + return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } @@ -11957,14 +12047,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; - if (unlikely(!f.file)) - return -EBADF; + f.flags = 0; } else { f = fdget(fd); - if (unlikely(!f.file)) - return -EBADF; } + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; if (unlikely(f.file->f_op != &io_uring_fops)) goto out_fput; @@ -12062,8 +12152,7 @@ iopoll_locked: out: percpu_ref_put(&ctx->refs); out_fput: - if (!(flags & IORING_ENTER_REGISTERED_RING)) - fdput(f); + fdput(f); return ret; } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 7e9abdb89712..acd32f05b519 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -43,9 +43,9 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, jffs2_dbg(1, "%s(): erase block %#08x (range %#08x-%#08x)\n", __func__, jeb->offset, jeb->offset, jeb->offset + c->sector_size); - instr = 
kmalloc(sizeof(struct erase_info), GFP_KERNEL); + instr = kzalloc(sizeof(struct erase_info), GFP_KERNEL); if (!instr) { - pr_warn("kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); + pr_warn("kzalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); @@ -57,8 +57,6 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, return; } - memset(instr, 0, sizeof(*instr)); - instr->addr = jeb->offset; instr->len = c->sector_size; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 00a110f40e10..39cec28096a7 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -604,6 +604,7 @@ out_root: jffs2_free_raw_node_refs(c); kvfree(c->blocks); jffs2_clear_xattr_subsystem(c); + jffs2_sum_exit(c); out_inohash: kfree(c->inocache_list); out_wbuf: diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e205fde7163a..6eca72cfa1f2 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -18,7 +18,15 @@ #include "kernfs-internal.h" static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ -static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +/* + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to + * call pr_cont() while holding rename_lock. Because sometimes pr_cont() + * will perform wakeups when releasing console_sem. Holding rename_lock + * will introduce deadlock if the scheduler reads the kernfs_name in the + * wakeup path. 
+ */ +static DEFINE_SPINLOCK(kernfs_pr_cont_lock); +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) unsigned long flags; int sz; - spin_lock_irqsave(&kernfs_rename_lock, flags); + spin_lock_irqsave(&kernfs_pr_cont_lock, flags); - sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf, - sizeof(kernfs_pr_cont_buf)); + sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); if (sz < 0) { pr_cont("(error)"); goto out; @@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn) pr_cont("%s", kernfs_pr_cont_buf); out: - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** @@ -823,13 +831,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); - /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ - spin_lock_irq(&kernfs_rename_lock); + spin_lock_irq(&kernfs_pr_cont_lock); len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len >= sizeof(kernfs_pr_cont_buf)) { - spin_unlock_irq(&kernfs_rename_lock); + spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } @@ -841,7 +848,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, parent = 
kernfs_find_ns(parent, name, ns); } - spin_unlock_irq(&kernfs_rename_lock); + spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 88423069407c..e3abfa843879 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -33,7 +33,6 @@ static DEFINE_SPINLOCK(kernfs_open_node_lock); static DEFINE_MUTEX(kernfs_open_file_mutex); struct kernfs_open_node { - atomic_t refcnt; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ @@ -530,10 +529,8 @@ static int kernfs_get_open_node(struct kernfs_node *kn, } on = kn->attr.open; - if (on) { - atomic_inc(&on->refcnt); + if (on) list_add_tail(&of->list, &on->files); - } spin_unlock_irq(&kernfs_open_node_lock); mutex_unlock(&kernfs_open_file_mutex); @@ -548,7 +545,6 @@ static int kernfs_get_open_node(struct kernfs_node *kn, if (!new_on) return -ENOMEM; - atomic_set(&new_on->refcnt, 0); atomic_set(&new_on->event, 1); init_waitqueue_head(&new_on->poll); INIT_LIST_HEAD(&new_on->files); @@ -556,17 +552,19 @@ static int kernfs_get_open_node(struct kernfs_node *kn, } /** - * kernfs_put_open_node - put kernfs_open_node - * @kn: target kernfs_nodet + * kernfs_unlink_open_file - Unlink @of from @kn. + * + * @kn: target kernfs_node * @of: associated kernfs_open_file * - * Put @kn->attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. + * Unlink @of from list of @kn's associated open files. If list of + * associated open files becomes empty, disassociate and free + * kernfs_open_node. * * LOCKING: * None. 
*/ -static void kernfs_put_open_node(struct kernfs_node *kn, +static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on = kn->attr.open; @@ -578,7 +576,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, if (of) list_del(&of->list); - if (atomic_dec_and_test(&on->refcnt)) + if (list_empty(&on->files)) kn->attr.open = NULL; else on = NULL; @@ -706,7 +704,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) return 0; err_put_node: - kernfs_put_open_node(kn, of); + kernfs_unlink_open_file(kn, of); err_seq_release: seq_release(inode, file); err_free: @@ -752,7 +750,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) mutex_unlock(&kernfs_open_file_mutex); } - kernfs_put_open_node(kn, of); + kernfs_unlink_open_file(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); @@ -768,15 +766,24 @@ void kernfs_drain_open_files(struct kernfs_node *kn) if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) return; - spin_lock_irq(&kernfs_open_node_lock); - on = kn->attr.open; - if (on) - atomic_inc(&on->refcnt); - spin_unlock_irq(&kernfs_open_node_lock); - if (!on) + /* + * lockless opportunistic check is safe below because no one is adding to + * ->attr.open at this point of time. This check allows early bail out + * if ->attr.open is already NULL. kernfs_unlink_open_file makes + * ->attr.open NULL only while holding kernfs_open_file_mutex so below + * check under kernfs_open_file_mutex will ensure bailing out if + * ->attr.open became NULL while waiting for the mutex. 
+ */ + if (!kn->attr.open) return; mutex_lock(&kernfs_open_file_mutex); + if (!kn->attr.open) { + mutex_unlock(&kernfs_open_file_mutex); + return; + } + + on = kn->attr.open; list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); @@ -789,8 +796,6 @@ void kernfs_drain_open_files(struct kernfs_node *kn) } mutex_unlock(&kernfs_open_file_mutex); - - kernfs_put_open_node(kn, NULL); } /* diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 208d2cff7bd3..e8f476c5f189 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,7 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; - conn->outstanding_credits = 1; + conn->outstanding_credits = 0; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); @@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work) return 0; } -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len) +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_read) ret = conn->transport->ops->rdma_read(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_write) ret = conn->transport->ops->rdma_write(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 
7a59aacb5daa..98c1cbe45ec9 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -122,11 +122,14 @@ struct ksmbd_transport_ops { int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, unsigned int remote_key); - int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - u32 remote_key, u64 remote_offset, u32 remote_len); - int (*rdma_write)(struct ksmbd_transport *t, void *buf, - unsigned int len, u32 remote_key, u64 remote_offset, - u32 remote_len); + int (*rdma_read)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); + int (*rdma_write)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); }; struct ksmbd_transport { @@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ebe6ca08467a..52aa0adeb951 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,7 +104,8 
@@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ - __u32 reserved[128]; /* Reserved room */ + __u32 smbd_max_io_size; /* smbd read write size */ + __u32 reserved[127]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 1e2076a53bed..df991107ad2c 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -20,7 +20,7 @@ * wildcard '*' and '?' * TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR * - * @string: string to compare with a pattern + * @str: string to compare with a pattern * @len: string length * @pattern: pattern string which might include wildcard '*' and '?' * @@ -152,8 +152,8 @@ out: /** * convert_to_nt_pathname() - extract and return windows path string * whose share directory prefix was removed from file path - * @filename : unix filename - * @sharepath: share path string + * @share: ksmbd_share_config pointer + * @path: path to report * * Return : windows path string or error */ @@ -250,8 +250,8 @@ char *ksmbd_extract_sharename(char *treename) /** * convert_to_unix_name() - convert windows name to unix format - * @path: name to be converted - * @tid: tree id of mathing share + * @share: ksmbd_share_config pointer + * @name: file name that is relative to share * * Return: converted name on success, otherwise NULL */ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 4a9460153b59..f8f456377a51 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -338,7 +338,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ret = 1; } - if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", credit_charge, conn->outstanding_credits); ret = 1; 
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 16c803a9d996..e6f4ccc12f49 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3938,6 +3938,12 @@ int smb2_query_dir(struct ksmbd_work *work) set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + /* + * req->OutputBufferLength is too small to contain even one entry. + * In this case, it immediately returns OutputBufferLength 0 to client. + */ + if (!d_info.out_buf_len && !d_info.num_entry) + goto no_buf_len; if (rc == 0) restart_ctx(&dir_fp->readdir_data.ctx); if (rc == -ENOSPC) @@ -3964,10 +3970,12 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->Buffer[0] = 0; inc_rfc1001_len(work->response_buf, 9); } else { +no_buf_len: ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; - d_info.data_count -= d_info.last_entry_off_align; + if (d_info.data_count >= d_info.last_entry_off_align) + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); @@ -6116,7 +6124,6 @@ out: static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, struct smb2_buffer_desc_v1 *desc, __le32 Channel, - __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { unsigned int i, ch_count; @@ -6134,15 +6141,13 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, le32_to_cpu(desc[i].length)); } } - if (ch_count != 1) { - ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", - ch_count); + if (!ch_count) return -EINVAL; - } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); + if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) + work->remote_key = le32_to_cpu(desc->token); return 0; } @@ -6150,14 +6155,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, struct smb2_read_req *req, void *data_buf, size_t length) { - struct 
smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), + le16_to_cpu(req->ReadChannelInfoLength)); if (err) return err; @@ -6180,6 +6183,8 @@ int smb2_read(struct ksmbd_work *work) size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; int err = 0; + bool is_rdma_channel = false; + unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); @@ -6191,6 +6196,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + is_rdma_channel = true; + max_read_size = get_smbd_max_read_write_size(); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { @@ -6201,7 +6211,6 @@ int smb2_read(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->ReadChannelInfoOffset, req->ReadChannelInfoLength); if (err) goto out; @@ -6223,9 +6232,9 @@ int smb2_read(struct ksmbd_work *work) length = le32_to_cpu(req->Length); mincount = le32_to_cpu(req->MinimumCount); - if (length > conn->vals->max_read_size) { + if (length > max_read_size) { ksmbd_debug(SMB, "limiting read size to max size(%u)\n", - conn->vals->max_read_size); + max_read_size); err = -EINVAL; goto out; } @@ -6257,8 +6266,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", nbytes, offset, mincount); - if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || - req->Channel == SMB2_CHANNEL_RDMA_V1) { + if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, work->aux_payload_buf, 
@@ -6328,23 +6336,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) length = le32_to_cpu(req->Length); id = req->VolatileFileId; - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); if (rpc_resp) { if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { @@ -6384,21 +6387,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t offset, size_t length, bool sync) { - struct smb2_buffer_desc_v1 *desc; char *data_buf; int ret; ssize_t nbytes; - desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), + le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { kvfree(data_buf); return ret; @@ -6427,8 +6427,9 @@ int smb2_write(struct ksmbd_work *work) size_t length; ssize_t nbytes; char *data_buf; - bool writethrough = false; + bool writethrough = false, is_rdma_channel = false; int err = 0; + 
unsigned int max_write_size = work->conn->vals->max_write_size; WORK_BUFFERS(work, req, rsp); @@ -6437,8 +6438,17 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + is_rdma_channel = true; + max_write_size = get_smbd_max_read_write_size(); + length = le32_to_cpu(req->RemainingBytes); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); if (req->Length != 0 || req->DataOffset != 0 || @@ -6450,7 +6460,6 @@ int smb2_write(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->WriteChannelInfoOffset, req->WriteChannelInfoLength); if (err) goto out; @@ -6474,12 +6483,9 @@ int smb2_write(struct ksmbd_work *work) goto out; } - offset = le64_to_cpu(req->Offset); - length = le32_to_cpu(req->Length); - - if (length > work->conn->vals->max_write_size) { + if (length > max_write_size) { ksmbd_debug(SMB, "limiting write size to max size(%u)\n", - work->conn->vals->max_write_size); + max_write_size); err = -EINVAL; goto out; } @@ -6487,24 +6493,17 @@ int smb2_write(struct ksmbd_work *work) if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; - if (req->Channel != SMB2_CHANNEL_RDMA_V1 && - req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if (is_rdma_channel == false) { + if 
((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) @@ -6520,8 +6519,7 @@ int smb2_write(struct ksmbd_work *work) /* read data from the client using rdma channel, and * write the data. */ - nbytes = smb2_write_rdma_channel(work, req, fp, offset, - le32_to_cpu(req->RemainingBytes), + nbytes = smb2_write_rdma_channel(work, req, fp, offset, length, writethrough); if (nbytes < 0) { err = (int)nbytes; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 9a7e211dbf4f..7f8ab14fb8ec 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -140,8 +140,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) hdr = work->request_buf; if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && - hdr->Command == SMB_COM_NEGOTIATE) + hdr->Command == SMB_COM_NEGOTIATE) { + work->conn->outstanding_credits++; return 0; + } return -EINVAL; } diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 6ecf55ea1fed..38f23bf981ac 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1261,6 +1261,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, if (!access_bits) access_bits = SET_MINIMUM_RIGHTS; + posix_acl_release(posix_acls); goto check_access_bits; } } diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 3ad6881e0f7e..7cb0eeb07c80 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -26,6 +26,7 @@ #include "mgmt/ksmbd_ida.h" #include "connection.h" #include "transport_tcp.h" +#include "transport_rdma.h" #define IPC_WAIT_TIMEOUT (2 * HZ) @@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct 
ksmbd_startup_request *req) init_smb2_max_trans_size(req->smb2_max_trans); if (req->smb2_max_credits) init_smb2_max_credits(req->smb2_max_credits); + if (req->smbd_max_io_size) + init_smbd_max_io_size(req->smbd_max_io_size); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index e646d79554b8..d035e060c2f0 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 524224; - -static int smb_direct_max_outstanding_rw_ops = 8; +static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; static LIST_HEAD(smb_direct_device_list); static DEFINE_RWLOCK(smb_direct_device_lock); @@ -147,18 +145,18 @@ struct smb_direct_transport { atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; - atomic_t rw_avail_ops; + int max_rw_credits; + int pages_per_rw_credit; + atomic_t rw_credits; wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_avail_ops; + wait_queue_head_t wait_rw_credits; mempool_t *sendmsg_mempool; struct kmem_cache *sendmsg_cache; mempool_t *recvmsg_mempool; struct kmem_cache *recvmsg_cache; - wait_queue_head_t wait_send_payload_pending; - atomic_t send_payload_pending; wait_queue_head_t wait_send_pending; atomic_t send_pending; @@ -208,12 +206,25 @@ struct smb_direct_recvmsg { struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; + int status; struct completion *completion; + struct list_head list; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; struct scatterlist sg_list[]; }; +void init_smbd_max_io_size(unsigned int sz) +{ + sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); + smb_direct_max_read_write_size = sz; +} + +unsigned int 
get_smbd_max_read_write_size(void) +{ + return smb_direct_max_read_write_size; +} + static inline int get_buf_page_count(void *buf, int size) { return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - @@ -377,7 +388,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) t->reassembly_queue_length = 0; init_waitqueue_head(&t->wait_reassembly_queue); init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_avail_ops); + init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); spin_lock_init(&t->recvmsg_queue_lock); @@ -386,8 +397,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->empty_recvmsg_queue_lock); INIT_LIST_HEAD(&t->empty_recvmsg_queue); - init_waitqueue_head(&t->wait_send_payload_pending); - atomic_set(&t->send_payload_pending, 0); init_waitqueue_head(&t->wait_send_pending); atomic_set(&t->send_pending, 0); @@ -417,8 +426,6 @@ static void free_transport(struct smb_direct_transport *t) wake_up_interruptible(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_payload_pending, - atomic_read(&t->send_payload_pending) == 0); wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); @@ -569,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + t->status = SMB_DIRECT_CS_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; @@ -873,13 +881,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); } - if (sendmsg->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); /* iterate 
and free the list of messages in reverse. the list's head * is invalid. @@ -911,21 +914,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t, { int ret; - if (wr->num_sge > 1) - atomic_inc(&t->send_payload_pending); - else - atomic_inc(&t->send_pending); - + atomic_inc(&t->send_pending); ret = ib_post_send(t->qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (wr->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); smb_direct_disconnect_rdma_connection(t); } return ret; @@ -983,18 +977,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, } static int wait_for_credits(struct smb_direct_transport *t, - wait_queue_head_t *waitq, atomic_t *credits) + wait_queue_head_t *waitq, atomic_t *total_credits, + int needed) { int ret; do { - if (atomic_dec_return(credits) >= 0) + if (atomic_sub_return(needed, total_credits) >= 0) return 0; - atomic_inc(credits); + atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, - atomic_read(credits) > 0 || - t->status != SMB_DIRECT_CS_CONNECTED); + atomic_read(total_credits) >= needed || + t->status != SMB_DIRECT_CS_CONNECTED); if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; @@ -1015,7 +1010,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t, return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); + return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); +} + +static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +{ + return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); +} + +static int calc_rw_credits(struct smb_direct_transport *t, + char *buf, unsigned int len) +{ + return DIV_ROUND_UP(get_buf_page_count(buf, 
len), + t->pages_per_rw_credit); } static int smb_direct_create_header(struct smb_direct_transport *t, @@ -1086,7 +1093,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen int offset, len; int i = 0; - if (nentries < get_buf_page_count(buf, size)) + if (size <= 0 || nentries < get_buf_page_count(buf, size)) return -EINVAL; offset = offset_in_page(buf); @@ -1118,7 +1125,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, int npages; npages = get_sg_list(buf, size, sg_list, nentries); - if (npages <= 0) + if (npages < 0) return -EINVAL; return ib_dma_map_sg(device, sg_list, npages, dir); } @@ -1313,11 +1320,21 @@ done: * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_payload_pending, - atomic_read(&st->send_payload_pending) == 0); + wait_event(st->wait_send_pending, + atomic_read(&st->send_pending) == 0); return ret; } +static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, + struct smb_direct_rdma_rw_msg *msg, + enum dma_data_direction dir) +{ + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, msg->sgt.nents, dir); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); +} + static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { @@ -1326,19 +1343,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, struct smb_direct_transport *t = msg->t; if (wc->status != IB_WC_SUCCESS) { + msg->status = -EIO; pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); - smb_direct_disconnect_rdma_connection(t); + if (wc->status != IB_WC_WR_FLUSH_ERR) + smb_direct_disconnect_rdma_connection(t); } - if (atomic_inc_return(&t->rw_avail_ops) > 0) - wake_up(&t->wait_rw_avail_ops); - - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, dir); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); complete(msg->completion); - kfree(msg); } static void read_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1351,94 +1363,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) read_write_done(cq, wc, DMA_TO_DEVICE); } -static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, - int buf_len, u32 remote_key, u64 remote_offset, - u32 remote_len, bool is_read) +static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + void *buf, int buf_len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len, + bool is_read) { - struct smb_direct_rdma_rw_msg *msg; - int ret; + struct smb_direct_rdma_rw_msg *msg, *next_msg; + int i, ret; DECLARE_COMPLETION_ONSTACK(completion); - struct ib_send_wr *first_wr = NULL; + struct ib_send_wr *first_wr; + LIST_HEAD(msg_list); + char *desc_buf; + int credits_needed; + unsigned int desc_buf_len; + size_t total_length = 0; + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; - ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + /* calculate needed credits */ + credits_needed = 0; + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + desc_buf_len = le32_to_cpu(desc[i].length); + + credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + desc_buf += desc_buf_len; + total_length += desc_buf_len; + if (desc_buf_len == 0 || total_length > buf_len || + total_length > t->max_rdma_rw_size) + return -EINVAL; + } + + ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", + is_read ? 
"read" : "write", buf_len, credits_needed); + + ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) return ret; - /* TODO: mempool */ - msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + - sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); - if (!msg) { - atomic_inc(&t->rw_avail_ops); - return -ENOMEM; - } + /* build rdma_rw_ctx for each descriptor */ + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } - msg->sgt.sgl = &msg->sg_list[0]; - ret = sg_alloc_table_chained(&msg->sgt, - get_buf_page_count(buf, buf_len), - msg->sg_list, SG_CHUNK_SIZE); - if (ret) { - atomic_inc(&t->rw_avail_ops); - kfree(msg); - return -ENOMEM; - } + desc_buf_len = le32_to_cpu(desc[i].length); - ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret <= 0) { - pr_err("failed to get pages\n"); - goto err; - } + msg->t = t; + msg->cqe.done = is_read ? read_done : write_done; + msg->completion = &completion; - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, get_buf_page_count(buf, buf_len), - 0, remote_offset, remote_key, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - if (ret < 0) { - pr_err("failed to init rdma_rw_ctx: %d\n", ret); - goto err; + msg->sgt.sgl = &msg->sg_list[0]; + ret = sg_alloc_table_chained(&msg->sgt, + get_buf_page_count(desc_buf, desc_buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { + kfree(msg); + ret = -ENOMEM; + goto out; + } + + ret = get_sg_list(desc_buf, desc_buf_len, + msg->sgt.sgl, msg->sgt.orig_nents); + if (ret < 0) { + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, + get_buf_page_count(desc_buf, desc_buf_len), + 0, + le64_to_cpu(desc[i].offset), + le32_to_cpu(desc[i].token), + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret < 0) { + pr_err("failed to init rdma_rw_ctx: %d\n", ret); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + list_add_tail(&msg->list, &msg_list); + desc_buf += desc_buf_len; } - msg->t = t; - msg->cqe.done = is_read ? read_done : write_done; - msg->completion = &completion; - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, - &msg->cqe, NULL); + /* concatenate work requests of rdma_rw_ctxs */ + first_wr = NULL; + list_for_each_entry_reverse(msg, &msg_list, list) { + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + &msg->cqe, first_wr); + } ret = ib_post_send(t->qp, first_wr, NULL); if (ret) { - pr_err("failed to post send wr: %d\n", ret); - goto err; + pr_err("failed to post send wr for RDMA R/W: %d\n", ret); + goto out; } + msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); wait_for_completion(&completion); - return 0; - -err: - atomic_inc(&t->rw_avail_ops); - if (first_wr) - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); + ret = msg->status; +out: + list_for_each_entry_safe(msg, next_msg, &msg_list, list) { + list_del(&msg->list); + smb_direct_free_rdma_rw_msg(t, msg, + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + atomic_add(credits_needed, &t->rw_credits); + wake_up(&t->wait_rw_credits); return ret; } -static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_write(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, false); + desc, desc_len, false); } -static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_read(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, true); + desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) @@ -1638,41 +1697,57 @@ out_err: return ret; } +static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +{ + return min_t(unsigned int, + t->cm_id->device->attrs.max_fast_reg_page_list_len, + 256); +} + static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct ib_device *device = t->cm_id->device; - int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; + int max_send_sges, max_rw_wrs, max_send_wrs; + unsigned int max_sge_per_wr, wrs_per_credit; - /* need 2 more sge. because a SMB_DIRECT header will be mapped, - * and maybe a send buffer could be not page aligned. + /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, + * SMB2 response could be mapped. 
*/ t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2; + max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { pr_err("max_send_size %d is too large\n", t->max_send_size); return -EINVAL; } - /* - * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA - * read/writes. HCA guarantees at least max_send_sge of sges for - * a RDMA read/write work request, and if memory registration is used, - * we need reg_mr, local_inv wrs for each read/write. + /* Calculate the number of work requests for RDMA R/W. + * The maximum number of pages which can be registered + * with one Memory region can be transferred with one + * R/W credit. And at least 4 work requests for each credit + * are needed for MR registration, RDMA R/W, local & remote + * MR invalidation. */ t->max_rdma_rw_size = smb_direct_max_read_write_size; - max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); - max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, - max_pages) * 2; - max_rw_wrs *= smb_direct_max_outstanding_rw_ops; + t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); + t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, + (t->pages_per_rw_credit - 1) * + PAGE_SIZE); + + max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, + device->attrs.max_sge_rd); + max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, + max_send_sges); + wrs_per_credit = max_t(unsigned int, 4, + DIV_ROUND_UP(t->pages_per_rw_credit, + max_sge_per_wr) + 1); + max_rw_wrs = t->max_rw_credits * wrs_per_credit; max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { - pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", - smb_direct_send_credit_target, - smb_direct_max_outstanding_rw_ops); + 
pr_err("consider lowering send_credit_target = %d\n", + smb_direct_send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; @@ -1687,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { - pr_err("warning: device max_send_sge = %d too small\n", - device->attrs.max_send_sge); - return -EINVAL; - } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1707,7 +1777,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->send_credit_target = smb_direct_send_credit_target; atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); + atomic_set(&t->rw_credits, t->max_rw_credits); t->max_send_size = smb_direct_max_send_size; t->max_recv_size = smb_direct_max_receive_size; @@ -1715,12 +1785,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_send_sge = max_sge_per_wr; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = - rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * - smb_direct_max_outstanding_rw_ops; + cap->max_rdma_ctxs = t->max_rw_credits; return 0; } @@ -1813,7 +1881,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->send_cq = ib_alloc_cq(t->cm_id->device, t, - t->send_credit_target, 0, IB_POLL_WORKQUEUE); + smb_direct_send_credit_target + cap->max_rdma_ctxs, + 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->send_cq)) { pr_err("Can't create RDMA send CQ\n"); ret = PTR_ERR(t->send_cq); @@ -1822,8 +1891,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->recv_cq = 
ib_alloc_cq(t->cm_id->device, t, - cap->max_send_wr + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); + t->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(t->recv_cq); @@ -1852,17 +1920,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { - int pages_per_mr, mr_count; - - pages_per_mr = min_t(int, pages_per_rw, - t->cm_id->device->attrs.max_fast_reg_page_list_len); - mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * - atomic_read(&t->rw_avail_ops); - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, - IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, + t->max_rw_credits, IB_MR_TYPE_MEM_REG, + t->pages_per_rw_credit, 0); if (ret) { pr_err("failed to init mr pool count %d pages %d\n", - mr_count, pages_per_mr); + t->max_rw_credits, t->pages_per_rw_credit); goto err; } } diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 5567d93a6f96..77aee4e5c9dc 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,6 +7,10 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ +#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024) +#define SMBD_MIN_IOSIZE (512 * 1024) +#define SMBD_MAX_IOSIZE (16 * 1024 * 1024) + /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,10 +56,14 @@ struct smb_direct_data_transfer { int ksmbd_rdma_init(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); +void init_smbd_max_io_size(unsigned int sz); +unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline int ksmbd_rdma_destroy(void) { return 0; } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { 
return false; } +static inline void init_smbd_max_io_size(unsigned int sz) { } +static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/namei.c b/fs/namei.c index 3dc0db2df561..1f28d3f463c3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -730,13 +730,6 @@ static bool legitimize_links(struct nameidata *nd) static bool legitimize_root(struct nameidata *nd) { - /* - * For scoped-lookups (where nd->root has been zeroed), we need to - * restart the whole lookup from scratch -- because set_root() is wrong - * for these lookups (nd->dfd is the root, not the filesystem root). - */ - if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) - return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; @@ -798,7 +791,7 @@ out: * @seq: seq number to check @dentry against * Returns: true on success, false on failure * - * Similar to to try_to_unlazy(), but here we have the next dentry already + * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. 
* Nothing should touch nameidata between try_to_unlazy_next() failure and @@ -1755,7 +1748,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, seq); - if (!try_to_unlazy(nd) != 0 || !grabbed_link) + if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) @@ -2769,7 +2762,8 @@ struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, EXPORT_SYMBOL(lookup_one); /** - * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - filesystem helper to lookup single pathname component + * @mnt_userns: idmapping of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to @@ -2780,14 +2774,15 @@ EXPORT_SYMBOL(lookup_one); * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. 
*/ -struct dentry *lookup_one_len_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_one_unlocked(struct user_namespace *mnt_userns, + const char *name, struct dentry *base, + int len) { struct qstr this; int err; struct dentry *ret; - err = lookup_one_common(&init_user_ns, name, base, len, &this); + err = lookup_one_common(mnt_userns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2796,6 +2791,59 @@ struct dentry *lookup_one_len_unlocked(const char *name, ret = lookup_slow(&this, base, 0); return ret; } +EXPORT_SYMBOL(lookup_one_unlocked); + +/** + * lookup_one_positive_unlocked - filesystem helper to lookup single + * pathname component + * @mnt_userns: idmapping of the mount the lookup is performed from + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have >d_inode stable, so this one avoids such problems. + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The helper should be called without i_mutex held. 
+ */ +struct dentry *lookup_one_positive_unlocked(struct user_namespace *mnt_userns, + const char *name, + struct dentry *base, int len) +{ + struct dentry *ret = lookup_one_unlocked(mnt_userns, name, base, len); + + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_unlocked); + +/** + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * Unlike lookup_one_len, it should be called without the parent + * i_mutex held, and will take the i_mutex itself if necessary. + */ +struct dentry *lookup_one_len_unlocked(const char *name, + struct dentry *base, int len) +{ + return lookup_one_unlocked(&init_user_ns, name, base, len); +} EXPORT_SYMBOL(lookup_one_len_unlocked); /* @@ -2809,12 +2857,7 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - struct dentry *ret = lookup_one_len_unlocked(name, base, len); - if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { - dput(ret); - ret = ERR_PTR(-ENOENT); - } - return ret; + return lookup_one_positive_unlocked(&init_user_ns, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); diff --git a/fs/namespace.c b/fs/namespace.c index 41461f55c039..e6a7e769d25d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1760,7 +1760,7 @@ out_unlock: /* * Is the caller allowed to modify his namespace? 
*/ -static inline bool may_mount(void) +bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6f5425e89ca6..2d72b1b7ed74 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -206,15 +206,16 @@ static int nfs_file_fsync_commit(struct file *file, int datasync) { struct inode *inode = file_inode(file); - int ret; + int ret, ret2; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - return ret; - return file_check_and_advance_wb_err(file); + ret2 = file_check_and_advance_wb_err(file); + if (ret2 < 0) + return ret2; + return ret; } int @@ -387,11 +388,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx, mapping->host)) { - status = nfs_wb_all(mapping->host); - if (status < 0) - return status; - } + if (nfs_ctx_key_to_expire(ctx, mapping->host)) + nfs_wb_all(mapping->host); return copied; } @@ -606,18 +604,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode, - int error) -{ - struct nfs_open_context *ctx; - - ctx = nfs_file_open_context(filp); - if (nfs_error_is_fatal_on_server(error) || - nfs_ctx_key_to_expire(ctx, inode)) - return 1; - return 0; -} - ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -645,7 +631,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) - goto out; + return result; } nfs_clear_invalid_mapping(file->f_mapping); @@ -664,6 +650,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = 
result; iocb->ki_pos += written; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { result = filemap_fdatawrite_range(file->f_mapping, @@ -681,17 +668,22 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) } result = generic_write_sync(iocb, written); if (result < 0) - goto out; + return result; +out: /* Return error values */ error = filemap_check_wb_err(file->f_mapping, since); - if (nfs_need_check_write(file, inode, error)) { - int err = nfs_wb_all(inode); - if (err < 0) - result = err; + switch (error) { + default: + break; + case -EDQUOT: + case -EFBIG: + case -ENOSPC: + nfs_wb_all(inode); + error = file_check_and_advance_wb_err(file); + if (error < 0) + result = error; } - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -out: return result; out_swapfile: diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 76deddab0a8f..2b2661582bbe 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -839,7 +839,12 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (IS_ERR_OR_NULL(lseg)) + if (IS_ERR(lseg)) { + /* Fall back to MDS on recoverable errors */ + if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg))) + lseg = NULL; + goto out; + } else if (!lseg) goto out; lo = NFS_I(ino)->layout; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index f73c09a9cf0a..e861d7bae305 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; struct fscache_cookie *cookie = nfs_i_fscache(inode); + loff_t i_size = i_size_read(inode); - if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_unuse_cookie(cookie, &auxdata, NULL); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_unuse_cookie(cookie, 
&auxdata, &i_size); } /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7eefa16ed381..8f8cd6e2d4db 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -841,6 +841,7 @@ static inline bool nfs_error_is_fatal_on_server(int err) case 0: case -ERESTARTSYS: case -EINTR: + case -ENOMEM: return false; } return nfs_error_is_fatal(err); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7b861e4f0533..03d3a270eff4 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -328,7 +328,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, char *read_name = NULL; int len, status = 0; - server = NFS_SERVER(ss_mnt->mnt_root->d_inode); + server = NFS_SB(ss_mnt->mnt_sb); if (!fattr) return ERR_PTR(-ENOMEM); @@ -346,7 +346,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out; snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); - r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr); + r_ino = nfs_fhget(ss_mnt->mnt_sb, src_fh, fattr); if (IS_ERR(r_ino)) { res = ERR_CAST(r_ino); goto out_free_name; diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3680c8da510c..f2dbf904c598 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!fs_locations) goto out_free; + fs_locations->fattr = nfs_alloc_fattr(); + if (!fs_locations->fattr) + goto out_free_2; /* Get locations */ dentry = ctx->clone_data.dentry; @@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0) - goto out_free_2; + goto out_free_3; err = -ENOENT; if (fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) - goto out_free_2; + goto out_free_3; err = nfs_follow_referral(fc, 
fs_locations); +out_free_3: + kfree(fs_locations->fattr); out_free_2: kfree(fs_locations); out_free: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a79f66432bd3..c0fdcf8c0032 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1162,7 +1162,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, { unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (server->caps & NFS_CAP_MOVEABLE) task_flags = RPC_TASK_MOVEABLE; return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -2568,7 +2568,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; kref_get(&data->kref); @@ -3098,6 +3098,10 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } if (!opendata->cancelled) nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; @@ -3733,7 +3737,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -4243,6 +4247,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, if (locations == NULL) goto out; + locations->fattr = fattr; + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; @@ -4252,17 +4258,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, * referral. Cause us to drop into the exception handler, which * will kick off migration recovery. 
*/ - if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ - nfs_fixup_referral_attributes(&locations->fattr); - - /* replace the lookup nfs_fattr with the locations nfs_fattr */ - memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + nfs_fixup_referral_attributes(fattr); memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -4404,7 +4407,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_flags = RPC_TASK_MOVEABLE; /* Is this is an attribute revalidation, subject to softreval? */ @@ -5768,9 +5771,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred) return 0; } -static inline int nfs4_server_supports_acls(struct nfs_server *server) +static bool nfs4_server_supports_acls(const struct nfs_server *server, + enum nfs4_acl_type type) { - return server->caps & NFS_CAP_ACLS; + switch (type) { + default: + return server->attr_bitmask[0] & FATTR4_WORD0_ACL; + case NFS4ACL_DACL: + return server->attr_bitmask[1] & FATTR4_WORD1_DACL; + case NFS4ACL_SACL: + return server->attr_bitmask[1] & FATTR4_WORD1_SACL; + } } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -5809,6 +5820,7 @@ unwind: } struct nfs4_cached_acl { + enum nfs4_acl_type type; int cached; size_t len; char data[]; @@ -5829,7 +5841,8 @@ static void nfs4_zap_acl_attr(struct inode *inode) nfs4_set_cached_acl(inode, NULL); } -static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, + size_t buflen, enum nfs4_acl_type type) 
{ struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_cached_acl *acl; @@ -5839,6 +5852,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_ acl = nfsi->nfs4_acl; if (acl == NULL) goto out; + if (acl->type != type) + goto out; if (buf == NULL) /* user is just asking for length */ goto out_len; if (acl->cached == 0) @@ -5854,7 +5869,9 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, + size_t pgbase, size_t acl_len, + enum nfs4_acl_type type) { struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; @@ -5871,6 +5888,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size goto out; acl->cached = 0; } + acl->type = type; acl->len = acl_len; out: nfs4_set_cached_acl(inode, acl); @@ -5886,14 +5904,17 @@ out: * length. The next getxattr call will then produce another round trip to * the server, this time with the input buf of the required size. 
*/ -static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct page **pages; struct nfs_getaclargs args = { .fh = NFS_FH(inode), + .acl_type = type, .acl_len = buflen, }; struct nfs_getaclres res = { + .acl_type = type, .acl_len = buflen, }; struct rpc_message msg = { @@ -5943,7 +5964,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu ret = -ERANGE; goto out_free; } - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len, + type); if (buf) { if (res.acl_len > buflen) { ret = -ERANGE; @@ -5963,14 +5985,15 @@ out_free: return ret; } -static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { .interruptible = true, }; ssize_t ret; do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); + ret = __nfs4_get_acl_uncached(inode, buf, buflen, type); trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; @@ -5979,34 +6002,37 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl return ret; } -static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, + enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); int ret; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - ret = nfs4_read_cached_acl(inode, buf, buflen); + ret = nfs4_read_cached_acl(inode, buf, buflen, 
type); if (ret != -ENOENT) /* -ENOENT is returned if there is no ACL or if there is an ACL * but no cached acl data, just the acl length */ return ret; - return nfs4_get_acl_uncached(inode, buf, buflen); + return nfs4_get_acl_uncached(inode, buf, buflen, type); } -static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_setaclargs arg = { - .fh = NFS_FH(inode), - .acl_pages = pages, - .acl_len = buflen, + .fh = NFS_FH(inode), + .acl_type = type, + .acl_len = buflen, + .acl_pages = pages, }; struct nfs_setaclres res; struct rpc_message msg = { @@ -6020,7 +6046,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl /* You can't remove system.nfs4_acl: */ if (buflen == 0) return -EINVAL; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -6051,12 +6077,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return ret; } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { }; int err; do { - err = __nfs4_proc_set_acl(inode, buf, buflen); + err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { /* @@ -6612,10 +6639,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int 
status = 0; + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6929,10 +6959,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; - struct nfs_client *client = - NFS_SERVER(lsp->ls_state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, @@ -7203,9 +7231,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; - struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -7655,21 +7682,70 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); +} + +#if defined(CONFIG_NFS_V4_1) +#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" + +static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry 
*unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL); +} + +#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" + +static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL); } +static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL); +} + +#endif + #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, @@ -7902,7 +7978,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, else bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - nfs_fattr_init(&fs_locations->fattr); + nfs_fattr_init(fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); @@ -7967,7 +8043,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server, unsigned long now = jiffies; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; 
locations->nlocations = 0; @@ -8032,7 +8108,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, }; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -10391,7 +10467,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 - | NFS_CAP_LGOPEN, + | NFS_CAP_LGOPEN + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10426,7 +10503,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR - | NFS_CAP_READ_PLUS, + | NFS_CAP_READ_PLUS + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10587,6 +10665,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; +#if defined(CONFIG_NFS_V4_1) +static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = { + .name = XATTR_NAME_NFSV4_DACL, + .list = nfs4_xattr_list_nfs4_dacl, + .get = nfs4_xattr_get_nfs4_dacl, + .set = nfs4_xattr_set_nfs4_dacl, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = { + .name = XATTR_NAME_NFSV4_SACL, + .list = nfs4_xattr_list_nfs4_sacl, + .get = nfs4_xattr_get_nfs4_sacl, + .set = nfs4_xattr_set_nfs4_sacl, +}; +#endif + #ifdef CONFIG_NFS_V4_2 static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { .prefix = XATTR_USER_PREFIX, @@ -10597,6 +10691,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#if defined(CONFIG_NFS_V4_1) + &nfs4_xattr_nfs4_dacl_handler, + &nfs4_xattr_nfs4_sacl_handler, +#endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif diff --git 
a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9e1c987c81e7..2540b35ec187 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, #endif /* CONFIG_NFS_V4_2 */ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_lock_state *lock; int status; @@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st list_for_each_entry(lock, &state->lock_states, ls_locks) { trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + *lost_locks += 1; } spin_unlock(&state->state_lock); } @@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_state *state; unsigned int loop = 0; @@ -1666,7 +1669,7 @@ restart: #endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = __nfs4_reclaim_open_state(sp, state, ops); + status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks); switch (status) { default: @@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct rb_node *pos; LIST_HEAD(freeme); int status = 0; + int lost_locks = 0; restart: rcu_read_lock(); @@ -1928,8 +1932,11 @@ restart: spin_unlock(&clp->cl_lock); rcu_read_unlock(); - status = nfs4_reclaim_open_state(sp, ops); + status = nfs4_reclaim_open_state(sp, ops, &lost_locks); if (status < 0) { + if (lost_locks) + pr_warn("NFS: %s: 
lost %d locks\n", + clp->cl_hostname, lost_locks); set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); @@ -1943,6 +1950,9 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); return 0; } @@ -2106,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred dprintk("<-- %s: no memory\n", __func__); goto out; } + locations->fattr = nfs_alloc_fattr(); + if (locations->fattr == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, @@ -2120,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred if (!locations->nlocations) goto out; - if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); goto out; @@ -2145,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred out: if (page != NULL) __free_page(page); + if (locations != NULL) + kfree(locations->fattr); kfree(locations); if (result) { pr_err("NFS: migration recovery failed (server %s)\n", diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86a5f6516928..acfe5f4bda48 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } -static void -encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg, - struct compound_hdr *hdr) +static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2]) { - __be32 *p; + switch (type) { + default: + bitmap[0] = FATTR4_WORD0_ACL; + bitmap[1] = 0; + break; + case 
NFS4ACL_DACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_DACL; + break; + case NFS4ACL_SACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_SACL; + } +} + +static void encode_setacl(struct xdr_stream *xdr, + const struct nfs_setaclargs *arg, + struct compound_hdr *hdr) +{ + __u32 bitmap[2]; + + nfs4_acltype_to_bitmap(arg->acl_type, bitmap); encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); encode_nfs4_stateid(xdr, &zero_stateid); - p = reserve_space(xdr, 2*4); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(FATTR4_WORD0_ACL); - p = reserve_space(xdr, 4); - *p = cpu_to_be32(arg->acl_len); + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + encode_uint32(xdr, arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } @@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - const __u32 nfs4_acl_bitmap[1] = { - [0] = FATTR4_WORD0_ACL, - }; + __u32 nfs4_acl_bitmap[2]; uint32_t replen; + nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap); + encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); @@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - struct nfs_getaclres *res) + struct nfs_getaclres *res, enum nfs4_acl_type type) { unsigned int savep; uint32_t attrlen, @@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto out; - if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) - return -EIO; - if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { - - /* The bitmap (xdr len + bitmaps) and the attr xdr len words - * are stored with the acl data to handle the problem of - * variable length bitmaps.*/ - res->acl_data_offset = xdr_page_pos(xdr); - res->acl_len = attrlen; - - /* 
Check for receive buffer overflow */ - if (res->acl_len > xdr_stream_remaining(xdr) || - res->acl_len + res->acl_data_offset > xdr->buf->page_len) { - res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", - attrlen, xdr_stream_remaining(xdr)); - } - } else - status = -EOPNOTSUPP; + switch (type) { + default: + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (!(bitmap[0] & FATTR4_WORD0_ACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_DACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_DACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_SACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_SACL)) + return -EOPNOTSUPP; + } + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_page_pos(xdr); + res->acl_len = attrlen; + + /* Check for receive buffer overflow */ + if (res->acl_len > xdr_stream_remaining(xdr) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); + } out: return status; } @@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, res); + status = decode_getacl(xdr, rqstp, res, res->acl_type); out: return status; @@ -7051,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (res->migration) { xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); if (status) @@ -7064,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst 
*req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9157dd19b8b4..317cedfa52bf 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .flags = RPC_TASK_ASYNC | flags, }; + if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: initiated pgio call " diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 856c962273c7..68a87be3e6f9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2000,6 +2000,7 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -2128,6 +2129,7 @@ lookup_again: lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); if (!lgp) { + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, PNFS_UPDATE_LAYOUT_NOMEM); nfs_layoutget_end(lo); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 6f325e10056c..9697cd5d2561 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); + + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); @@ -344,6 +348,10 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; + if (nfs_server_capable(old_dir, 
NFS_CAP_MOVEABLE) && + nfs_server_capable(new_dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f00d45cf80ef..1c706465d090 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -603,8 +603,9 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static int nfs_page_async_flush(struct page *page, + struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; @@ -630,11 +631,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, /* * Remove the problematic req upon fatal errors on the server */ - if (nfs_error_is_fatal(ret)) { - if (nfs_error_is_fatal_on_server(ret)) - goto out_launder; - } else - ret = -EAGAIN; + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; + if (wbc->sync_mode == WB_SYNC_NONE) + ret = AOP_WRITEPAGE_ACTIVATE; + redirty_page_for_writepage(wbc, page); nfs_redirty_request(req); pgio->pg_error = 0; } else @@ -650,15 +651,8 @@ out_launder: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - int ret; - nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); - ret = AOP_WRITEPAGE_ACTIVATE; - } - return ret; + return nfs_page_async_flush(page, wbc, pgio); } /* @@ -681,11 +675,7 @@ static int nfs_writepage_locked(struct page *page, err = nfs_do_writepage(page, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); - if (err < 0) - return err; - if (nfs_error_is_fatal(pgio.pg_error)) - return pgio.pg_error; - return 0; + 
return err; } int nfs_writepage(struct page *page, struct writeback_control *wbc) @@ -737,19 +727,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) priority = wb_priority(wbc); } - nfs_pageio_init_write(&pgio, inode, priority, false, - &nfs_async_write_completion_ops); - pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); - pgio.pg_error = 0; - nfs_pageio_complete(&pgio); + do { + nfs_pageio_init_write(&pgio, inode, priority, false, + &nfs_async_write_completion_ops); + pgio.pg_io_completion = ioc; + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, + &pgio); + pgio.pg_error = 0; + nfs_pageio_complete(&pgio); + } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); if (err < 0) goto out_err; - err = pgio.pg_error; - if (nfs_error_is_fatal(err)) - goto out_err; return 0; out_err: return err; @@ -1444,7 +1434,7 @@ static void nfs_async_write_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - if (nfs_error_is_fatal(error)) + if (nfs_error_is_fatal_on_server(error)) nfs_write_error(req, error); else nfs_redirty_request(req); @@ -1719,6 +1709,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + + if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + /* Set up the initial task struct. 
*/ nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); trace_nfs_initiate_commit(data); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a4fcdc7927ca..8e9d2b35175f 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -492,7 +492,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, - &new_valid, true, NULL); + &new_valid, ni->mi.sbi->options->prealloc, NULL); up_write(&ni->file.run_lock); if (new_valid < ni->i_valid) @@ -659,7 +659,13 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) /* * Normal file: Allocate clusters, do not change 'valid' size. */ - err = ntfs_set_size(inode, max(end, i_size)); + loff_t new_size = max(end, i_size); + + err = inode_newsize_ok(inode, new_size); + if (err) + goto out; + + err = ntfs_set_size(inode, new_size); if (err) goto out; @@ -759,7 +765,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } inode_dio_wait(inode); - if (attr->ia_size < oldsize) + if (attr->ia_size <= oldsize) err = ntfs_truncate(inode, attr->ia_size); else if (attr->ia_size > oldsize) err = ntfs_extend(inode, attr->ia_size, 0, NULL); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 6f47a9c17f89..18842998c8fa 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -1964,10 +1964,8 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, vcn += clen; - if (vbo + bytes >= end) { + if (vbo + bytes >= end) bytes = end - vbo; - flags |= FIEMAP_EXTENT_LAST; - } if (vbo + bytes <= valid) { ; @@ -1977,6 +1975,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, /* vbo < valid && valid < vbo + bytes */ u64 dlen = valid - vbo; + if (vbo + dlen >= end) + flags |= FIEMAP_EXTENT_LAST; + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, flags); if (err < 0) @@ -1995,6 +1996,9 @@ int ni_fiemap(struct ntfs_inode *ni, struct 
fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_UNWRITTEN; } + if (vbo + bytes >= end) + flags |= FIEMAP_EXTENT_LAST; + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); if (err < 0) break; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 06492f088d60..49b7df616778 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -1185,8 +1185,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, if (!r_page) return -ENOMEM; - memset(info, 0, sizeof(struct restart_info)); - /* Determine which restart area we are looking for. */ if (first) { vbo = 0; @@ -3791,10 +3789,11 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (!log) return -ENOMEM; + memset(&rst_info, 0, sizeof(struct restart_info)); + log->ni = ni; log->l_size = l_size; log->one_page_buf = kmalloc(page_size, GFP_NOFS); - if (!log->one_page_buf) { err = -ENOMEM; goto out; @@ -3842,6 +3841,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (rst_info.vbo) goto check_restart_area; + memset(&rst_info2, 0, sizeof(struct restart_info)); err = log_read_rst(log, l_size, false, &rst_info2); /* Determine which restart area to use. */ @@ -4085,8 +4085,10 @@ process_log: if (client == LFS_NO_CLIENT_LE) { /* Insert "NTFS" client LogFile. */ client = ra->client_idx[0]; - if (client == LFS_NO_CLIENT_LE) - return -EINVAL; + if (client == LFS_NO_CLIENT_LE) { + err = -EINVAL; + goto out; + } t16 = le16_to_cpu(client); cr = ca + t16; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 74f60c457f28..be4ebdd8048b 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -758,6 +758,7 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t vbo = iocb->ki_pos; loff_t end; int wr = iov_iter_rw(iter) & WRITE; + size_t iter_count = iov_iter_count(iter); loff_t valid; ssize_t ret; @@ -771,10 +772,13 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) wr ? 
ntfs_get_block_direct_IO_W : ntfs_get_block_direct_IO_R); - if (ret <= 0) + if (ret > 0) + end = vbo + ret; + else if (wr && ret == -EIOCBQUEUED) + end = vbo + iter_count; + else goto out; - end = vbo + ret; valid = ni->i_valid; if (wr) { if (end > valid && !S_ISBLK(inode->i_mode)) { @@ -1950,6 +1954,7 @@ const struct address_space_operations ntfs_aops = { .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, }; const struct address_space_operations ntfs_aops_cmpr = { diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index afd0ddad826f..5e0e0280e70d 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -112,7 +112,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, return -ENOMEM; if (!size) { - ; + /* EA info persists, but xattr is empty. Looks like EA problem. */ } else if (attr_ea->non_res) { struct runs_tree run; @@ -259,7 +259,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags) + size_t val_size, int flags, bool locked) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -278,7 +278,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, u64 new_sz; void *p; - ni_lock(ni); + if (!locked) + ni_lock(ni); run_init(&ea_run); @@ -467,7 +468,8 @@ update_ea: mark_inode_dirty(&ni->vfs_inode); out: - ni_unlock(ni); + if (!locked) + ni_unlock(ni); run_close(&ea_run); kfree(ea_all); @@ -541,7 +543,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type) + int type, bool init_acl) { const char *name; size_t size, name_len; @@ -554,8 +556,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, switch (type) { case ACL_TYPE_ACCESS: - if (acl) { - umode_t mode = 
inode->i_mode; + /* Do not change i_mode if we are in init_acl */ + if (acl && !init_acl) { + umode_t mode; err = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); @@ -598,7 +601,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, flags); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. */ if (!err) @@ -616,7 +619,68 @@ out: int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type); + return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); +} + +static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, void *buffer, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + acl = ntfs_get_acl(inode, type, false); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) + return -ENODATA; + + err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, const void *value, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + + if (!value) { + acl = NULL; + } else { + acl = posix_acl_from_xattr(mnt_userns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + err = posix_acl_valid(mnt_userns, acl); + if (err) + goto release_and_out; + } + } + + err = ntfs_set_acl(mnt_userns, inode, acl, type); + +release_and_out: + posix_acl_release(acl); 
+ return err; } /* @@ -636,7 +700,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, if (default_acl) { err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT); + ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; @@ -647,7 +711,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, else { if (!err) err = ntfs_set_acl_ex(mnt_userns, inode, acl, - ACL_TYPE_ACCESS); + ACL_TYPE_ACCESS, true); posix_acl_release(acl); } @@ -785,6 +849,23 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + /* TODO: init_user_ns? */ + err = ntfs_xattr_get_acl( + &init_user_ns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + buffer, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -897,10 +978,29 @@ set_new_fa: goto out; } +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + err = ntfs_xattr_set_acl( + mnt_userns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + value, size); + goto out; + } +#endif /* Deal with NTFS extended attribute. 
*/ - err = ntfs_set_ea(inode, name, name_len, value, size, flags); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); out: + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + return err; } @@ -913,35 +1013,37 @@ int ntfs_save_wsl_perm(struct inode *inode) { int err; __le32 value; + struct ntfs_inode *ni = ntfs_i(inode); - /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ + ni_lock(ni); value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); /* true == already locked. */ if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0); + sizeof(value), 0, true); if (err) goto out; } out: + ni_unlock(ni); /* In case of error should we delete all WSL xattr? 
*/ return err; } diff --git a/fs/open.c b/fs/open.c index be849dcca032..1d57fbde2feb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -224,6 +224,21 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) } #endif /* BITS_PER_LONG == 32 */ +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) +COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, + compat_arg_u64_dual(length)) +{ + return ksys_truncate(pathname, compat_arg_u64_glue(length)); +} +#endif + +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) +COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, + compat_arg_u64_dual(length)) +{ + return ksys_ftruncate(fd, compat_arg_u64_glue(length)); +} +#endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -339,6 +354,15 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return ksys_fallocate(fd, mode, offset, len); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) +COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), + compat_arg_u64_dual(len)) +{ + return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), + compat_arg_u64_glue(len)); +} +#endif + /* * access() needs to use the real uid/gid, not the effective uid/gid. 
* We do this by temporarily clearing all FS-related capabilities and diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index e040970408d4..714ec569d25b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -44,9 +44,9 @@ static bool ovl_must_copy_xattr(const char *name) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } -int ovl_copy_xattr(struct super_block *sb, struct dentry *old, - struct dentry *new) +int ovl_copy_xattr(struct super_block *sb, struct path *oldpath, struct dentry *new) { + struct dentry *old = oldpath->dentry; ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int error = 0; @@ -94,9 +94,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, continue; /* Discard */ } retry: - size = vfs_getxattr(&init_user_ns, old, name, value, value_size); + size = ovl_do_getxattr(oldpath, name, value, value_size); if (size == -ERANGE) - size = vfs_getxattr(&init_user_ns, old, name, NULL, 0); + size = ovl_do_getxattr(oldpath, name, NULL, 0); if (size < 0) { error = size; @@ -117,7 +117,7 @@ retry: goto retry; } - error = vfs_setxattr(&init_user_ns, new, name, value, size, 0); + error = ovl_do_setxattr(OVL_FS(sb), new, name, value, size, 0); if (error) { if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) break; @@ -292,17 +292,19 @@ out_fput: return error; } -static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat) +static int ovl_set_size(struct ovl_fs *ofs, + struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { .ia_valid = ATTR_SIZE, .ia_size = stat->size, }; - return notify_change(&init_user_ns, upperdentry, &attr, NULL); + return ovl_do_notify_change(ofs, upperdentry, &attr); } -static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, + struct kstat *stat) { struct iattr attr = { .ia_valid = @@ -311,10 +313,11 @@ static int 
ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) .ia_mtime = stat->mtime, }; - return notify_change(&init_user_ns, upperdentry, &attr, NULL); + return ovl_do_notify_change(ofs, upperdentry, &attr); } -int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, + struct kstat *stat) { int err = 0; @@ -323,7 +326,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_valid = ATTR_MODE, .ia_mode = stat->mode, }; - err = notify_change(&init_user_ns, upperdentry, &attr, NULL); + err = ovl_do_notify_change(ofs, upperdentry, &attr); } if (!err) { struct iattr attr = { @@ -331,10 +334,10 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_uid = stat->uid, .ia_gid = stat->gid, }; - err = notify_change(&init_user_ns, upperdentry, &attr, NULL); + err = ovl_do_notify_change(ofs, upperdentry, &attr); } if (!err) - ovl_set_timestamps(upperdentry, stat); + ovl_set_timestamps(ofs, upperdentry, stat); return err; } @@ -433,7 +436,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); + err = ovl_setxattr(ofs, index, OVL_XATTR_UPPER, fh->buf, fh->fb.len); kfree(fh); return err; @@ -474,7 +477,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) return err; - temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0)); + temp = ovl_create_temp(ofs, indexdir, OVL_CATTR(S_IFDIR | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) goto free_name; @@ -483,16 +486,16 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) goto out; - index = lookup_one_len(name.name, indexdir, name.len); + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); } else { - err = ovl_do_rename(dir, temp, dir, index, 0); + err = ovl_do_rename(ofs, dir, 
temp, dir, index, 0); dput(index); } out: if (err) - ovl_cleanup(dir, temp); + ovl_cleanup(ofs, dir, temp); dput(temp); free_name: kfree(name.name); @@ -519,6 +522,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) int err; struct dentry *upper; struct dentry *upperdir = ovl_dentry_upper(c->parent); + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(upperdir); /* Mark parent "impure" because it may now contain non-pure upper */ @@ -531,16 +535,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) return err; inode_lock_nested(udir, I_MUTEX_PARENT); - upper = lookup_one_len(c->dentry->d_name.name, upperdir, - c->dentry->d_name.len); + upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, + c->dentry->d_name.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper); + err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); dput(upper); if (!err) { /* Restore timestamps on parent (best effort) */ - ovl_set_timestamps(upperdir, &c->pstat); + ovl_set_timestamps(ofs, upperdir, &c->pstat); ovl_dentry_set_upper_alias(c->dentry); } } @@ -578,7 +582,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - err = ovl_copy_xattr(c->dentry->d_sb, c->lowerpath.dentry, temp); + err = ovl_copy_xattr(c->dentry->d_sb, &c->lowerpath, temp); if (err) return err; @@ -614,9 +618,9 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) inode_lock(temp->d_inode); if (S_ISREG(c->stat.mode)) - err = ovl_set_size(temp, &c->stat); + err = ovl_set_size(ofs, temp, &c->stat); if (!err) - err = ovl_set_attr(temp, &c->stat); + err = ovl_set_attr(ofs, temp, &c->stat); inode_unlock(temp->d_inode); return err; @@ -656,6 +660,7 @@ static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) */ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct inode 
*udir = d_inode(c->destdir), *wdir = d_inode(c->workdir); struct dentry *temp, *upper; @@ -677,7 +682,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (err) goto unlock; - temp = ovl_create_temp(c->workdir, &cattr); + temp = ovl_create_temp(ofs, c->workdir, &cattr); ovl_revert_cu_creds(&cc); err = PTR_ERR(temp); @@ -694,12 +699,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) goto cleanup; } - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto cleanup; - err = ovl_do_rename(wdir, temp, udir, upper, 0); + err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0); dput(upper); if (err) goto cleanup; @@ -716,7 +722,7 @@ unlock: return err; cleanup: - ovl_cleanup(wdir, temp); + ovl_cleanup(ofs, wdir, temp); dput(temp); goto unlock; } @@ -724,6 +730,7 @@ cleanup: /* Copyup using O_TMPFILE which does not require cross dir locking */ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) { + struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; struct ovl_cu_creds cc; @@ -733,7 +740,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; - temp = ovl_do_tmpfile(c->workdir, c->stat.mode); + temp = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); ovl_revert_cu_creds(&cc); if (IS_ERR(temp)) @@ -745,10 +752,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_lock_nested(udir, I_MUTEX_PARENT); - upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len); + upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, + c->destname.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(temp, udir, upper); + err = ovl_do_link(ofs, temp, udir, upper); dput(upper); } inode_unlock(udir); @@ -836,7 +844,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) /* Restore 
timestamps on parent (best effort) */ inode_lock(udir); - ovl_set_timestamps(c->destdir, &c->pstat); + ovl_set_timestamps(ofs, c->destdir, &c->pstat); inode_unlock(udir); ovl_dentry_set_upper_alias(c->dentry); @@ -865,12 +873,12 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, return true; } -static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) +static ssize_t ovl_getxattr_value(struct path *path, char *name, char **value) { ssize_t res; char *buf; - res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); + res = ovl_do_getxattr(path, name, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) res = 0; @@ -879,7 +887,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) if (!buf) return -ENOMEM; - res = vfs_getxattr(&init_user_ns, dentry, name, buf, res); + res = ovl_do_getxattr(path, name, buf, res); if (res < 0) kfree(buf); else @@ -906,8 +914,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) return -EIO; if (c->stat.size) { - err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, - &capability); + err = cap_size = ovl_getxattr_value(&upperpath, XATTR_NAME_CAPS, + &capability); if (cap_size < 0) goto out; } @@ -921,14 +929,14 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * don't want that to happen for normal copy-up operation. 
*/ if (capability) { - err = vfs_setxattr(&init_user_ns, upperpath.dentry, - XATTR_NAME_CAPS, capability, cap_size, 0); + err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); if (err) goto out_free; } - err = ovl_do_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); if (err) goto out_free; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index f18490813170..6b03457f72bb 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -23,15 +23,15 @@ MODULE_PARM_DESC(redirect_max, static int ovl_set_redirect(struct dentry *dentry, bool samedir); -int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry) { int err; dget(wdentry); if (d_is_dir(wdentry)) - err = ovl_do_rmdir(wdir, wdentry); + err = ovl_do_rmdir(ofs, wdir, wdentry); else - err = ovl_do_unlink(wdir, wdentry); + err = ovl_do_unlink(ofs, wdir, wdentry); dput(wdentry); if (err) { @@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -struct dentry *ovl_lookup_temp(struct dentry *workdir) +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -51,7 +51,7 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir) /* counter is allowed to wrap, since temp dentries are ephemeral */ snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); - temp = lookup_one_len(name, workdir, strlen(name)); + temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { pr_err("workdir/%s already exists\n", name); dput(temp); @@ -70,11 +70,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) struct inode *wdir = workdir->d_inode; if (!ofs->whiteout) { - whiteout = ovl_lookup_temp(workdir); + whiteout = ovl_lookup_temp(ofs, workdir); if (IS_ERR(whiteout)) goto out; - err = 
ovl_do_whiteout(wdir, whiteout); + err = ovl_do_whiteout(ofs, wdir, whiteout); if (err) { dput(whiteout); whiteout = ERR_PTR(err); @@ -84,11 +84,11 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) } if (ofs->share_whiteout) { - whiteout = ovl_lookup_temp(workdir); + whiteout = ovl_lookup_temp(ofs, workdir); if (IS_ERR(whiteout)) goto out; - err = ovl_do_link(ofs->whiteout, wdir, whiteout); + err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); if (!err) goto out; @@ -122,27 +122,28 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_do_rename(wdir, whiteout, dir, dentry, flags); + err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags); if (err) goto kill_whiteout; if (flags) - ovl_cleanup(wdir, dentry); + ovl_cleanup(ofs, wdir, dentry); out: dput(whiteout); return err; kill_whiteout: - ovl_cleanup(wdir, whiteout); + ovl_cleanup(ofs, wdir, whiteout); goto out; } -int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode) { int err; struct dentry *d, *dentry = *newdentry; - err = ovl_do_mkdir(dir, dentry, mode); + err = ovl_do_mkdir(ofs, dir, dentry, mode); if (err) return err; @@ -154,8 +155,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) * to it unhashed and negative. If that happens, try to * lookup a new hashed and positive dentry. 
*/ - d = lookup_one_len(dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); + d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); if (IS_ERR(d)) { pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", dentry, err); @@ -167,8 +168,8 @@ int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode) return 0; } -struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct ovl_cattr *attr) +struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry *newdentry, struct ovl_cattr *attr) { int err; @@ -180,28 +181,28 @@ struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, goto out; if (attr->hardlink) { - err = ovl_do_link(attr->hardlink, dir, newdentry); + err = ovl_do_link(ofs, attr->hardlink, dir, newdentry); } else { switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, attr->mode); + err = ovl_do_create(ofs, dir, newdentry, attr->mode); break; case S_IFDIR: /* mkdir is special... 
*/ - err = ovl_mkdir_real(dir, &newdentry, attr->mode); + err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - err = ovl_do_mknod(dir, newdentry, attr->mode, + err = ovl_do_mknod(ofs, dir, newdentry, attr->mode, attr->rdev); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, attr->link); + err = ovl_do_symlink(ofs, dir, newdentry, attr->link); break; default: @@ -223,10 +224,11 @@ out: return newdentry; } -struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, + struct ovl_cattr *attr) { - return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir), - attr); + return ovl_create_real(ofs, d_inode(workdir), + ovl_lookup_temp(ofs, workdir), attr); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -330,10 +332,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(udir, - lookup_one_len(dentry->d_name.name, - upperdir, - dentry->d_name.len), + newdentry = ovl_create_real(ofs, udir, + ovl_lookup_upper(ofs, dentry->d_name.name, + upperdir, dentry->d_name.len), attr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) @@ -353,7 +354,7 @@ out_unlock: return err; out_cleanup: - ovl_cleanup(udir, newdentry); + ovl_cleanup(ofs, udir, newdentry); dput(newdentry); goto out_unlock; } @@ -361,6 +362,7 @@ out_cleanup: static struct dentry *ovl_clear_empty(struct dentry *dentry, struct list_head *list) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); @@ -391,12 +393,12 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = 
ovl_create_temp(workdir, OVL_CATTR(stat.mode)); + opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode)); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_copy_xattr(dentry->d_sb, upper, opaquedir); + err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir); if (err) goto out_cleanup; @@ -405,17 +407,17 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, goto out_cleanup; inode_lock(opaquedir->d_inode); - err = ovl_set_attr(opaquedir, &stat); + err = ovl_set_attr(ofs, opaquedir, &stat); inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; - err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; - ovl_cleanup_whiteouts(upper, list); - ovl_cleanup(wdir, upper); + ovl_cleanup_whiteouts(ofs, upper, list); + ovl_cleanup(ofs, wdir, upper); unlock_rename(workdir, upperdir); /* dentry's upper doesn't match now, get rid of it */ @@ -424,7 +426,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - ovl_cleanup(wdir, opaquedir); + ovl_cleanup(ofs, wdir, opaquedir); dput(opaquedir); out_unlock: unlock_rename(workdir, upperdir); @@ -432,8 +434,8 @@ out: return ERR_PTR(err); } -static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, - const struct posix_acl *acl) +static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry, + const char *name, const struct posix_acl *acl) { void *buffer; size_t size; @@ -451,7 +453,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (err < 0) goto out_free; - err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE); + err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE); out_free: kfree(buffer); return err; @@ -460,6 +462,7 @@ out_free: static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct 
ovl_cattr *cattr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); @@ -484,8 +487,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -494,7 +497,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) goto out_dput; - newdentry = ovl_create_temp(workdir, cattr); + newdentry = ovl_create_temp(ofs, workdir, cattr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_dput; @@ -510,19 +513,19 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); - err = notify_change(&init_user_ns, newdentry, &attr, NULL); + err = ovl_do_notify_change(ofs, newdentry, &attr); inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; } if (!hardlink) { - err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS, - acl); + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_ACCESS, acl); if (err) goto out_cleanup; - err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT, - default_acl); + err = ovl_set_upper_acl(ofs, newdentry, + XATTR_NAME_POSIX_ACL_DEFAULT, default_acl); if (err) goto out_cleanup; } @@ -532,20 +535,20 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(wdir, newdentry, udir, upper, + err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, RENAME_EXCHANGE); if (err) goto out_cleanup; - ovl_cleanup(wdir, upper); + ovl_cleanup(ofs, wdir, upper); } else { - err = 
ovl_do_rename(wdir, newdentry, udir, upper, 0); + err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0); if (err) goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); if (err) { - ovl_cleanup(udir, newdentry); + ovl_cleanup(ofs, udir, newdentry); dput(newdentry); } out_dput: @@ -560,7 +563,7 @@ out: return err; out_cleanup: - ovl_cleanup(wdir, newdentry); + ovl_cleanup(ofs, wdir, newdentry); dput(newdentry); goto out_dput; } @@ -767,8 +770,8 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (err) goto out_dput; - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -800,6 +803,7 @@ out: static int ovl_remove_upper(struct dentry *dentry, bool is_dir, struct list_head *list) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; struct dentry *upper; @@ -814,8 +818,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, } inode_lock_nested(dir, I_MUTEX_PARENT); - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto out_unlock; @@ -826,9 +830,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out_dput_upper; if (is_dir) - err = vfs_rmdir(&init_user_ns, dir, upper); + err = ovl_do_rmdir(ofs, dir, upper); else - err = vfs_unlink(&init_user_ns, dir, upper, NULL); + err = ovl_do_unlink(ofs, dir, upper); ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); /* @@ -880,7 +884,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; const struct cred *old_cred; - struct dentry *upperdentry; bool lower_positive = ovl_lower_positive(dentry); LIST_HEAD(list); 
@@ -923,9 +926,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) * Note: we fail to update ctime if there was no copy-up, only a * whiteout */ - upperdentry = ovl_dentry_upper(dentry); - if (upperdentry) - ovl_copyattr(d_inode(upperdentry), d_inode(dentry)); + if (ovl_dentry_upper(dentry)) + ovl_copyattr(d_inode(dentry)); out_drop_write: ovl_drop_write(dentry); @@ -1095,6 +1097,7 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; + struct ovl_fs *ofs = OVL_FS(old->d_sb); LIST_HEAD(list); err = -EINVAL; @@ -1189,8 +1192,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, trap = lock_rename(new_upperdir, old_upperdir); - olddentry = lookup_one_len(old->d_name.name, old_upperdir, - old->d_name.len); + olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, + old->d_name.len); err = PTR_ERR(olddentry); if (IS_ERR(olddentry)) goto out_unlock; @@ -1199,8 +1202,8 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, if (!ovl_matches_upper(old, olddentry)) goto out_dput_old; - newdentry = lookup_one_len(new->d_name.name, new_upperdir, - new->d_name.len); + newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, + new->d_name.len); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_dput_old; @@ -1251,13 +1254,13 @@ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, if (err) goto out_dput; - err = ovl_do_rename(old_upperdir->d_inode, olddentry, + err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry, new_upperdir->d_inode, newdentry, flags); if (err) goto out_dput; if (cleanup_whiteout) - ovl_cleanup(old_upperdir->d_inode, newdentry); + ovl_cleanup(ofs, old_upperdir->d_inode, newdentry); if (overwrite && d_inode(new)) { if (new_is_dir) @@ -1272,9 +1275,9 @@ static int ovl_rename(struct user_namespace 
*mnt_userns, struct inode *olddir, (d_inode(new) && ovl_type_origin(new))); /* copy ctime: */ - ovl_copyattr(d_inode(olddentry), d_inode(old)); + ovl_copyattr(d_inode(old)); if (d_inode(new) && ovl_dentry_upper(new)) - ovl_copyattr(d_inode(newdentry), d_inode(new)); + ovl_copyattr(d_inode(new)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index ebde05c9cf62..2eada97bbd23 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -391,6 +391,11 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, * pointer because we hold no lock on the real dentry. */ take_dentry_name_snapshot(&name, real); + /* + * No mnt_userns handling here: it's an internal lookup. Could skip + * permission checking altogether, but for now just use non-mnt_userns + * transformed ids. + */ this = lookup_one_len(name.name.name, connected, name.name.len); release_dentry_name_snapshot(&name); err = PTR_ERR(this); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 9d69b4dbb8c4..daff601b5c41 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -38,9 +38,11 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) #define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) static struct file *ovl_open_realfile(const struct file *file, - struct inode *realinode) + struct path *realpath) { + struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); + struct user_namespace *real_mnt_userns; struct file *realfile; const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; @@ -51,11 +53,12 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode); + real_mnt_userns = mnt_user_ns(realpath->mnt); + err = inode_permission(real_mnt_userns, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if 
(!inode_owner_or_capable(&init_user_ns, realinode)) + if (!inode_owner_or_capable(real_mnt_userns, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, @@ -101,21 +104,21 @@ static int ovl_change_flags(struct file *file, unsigned int flags) static int ovl_real_fdget_meta(const struct file *file, struct fd *real, bool allow_meta) { - struct inode *inode = file_inode(file); - struct inode *realinode; + struct dentry *dentry = file_dentry(file); + struct path realpath; real->flags = 0; real->file = file->private_data; if (allow_meta) - realinode = ovl_inode_real(inode); + ovl_path_real(dentry, &realpath); else - realinode = ovl_inode_realdata(inode); + ovl_path_realdata(dentry, &realpath); /* Has it been copied up since we'd opened it? */ - if (unlikely(file_inode(real->file) != realinode)) { + if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) { real->flags = FDPUT_FPUT; - real->file = ovl_open_realfile(file, realinode); + real->file = ovl_open_realfile(file, &realpath); return PTR_ERR_OR_ZERO(real->file); } @@ -141,17 +144,20 @@ static int ovl_real_fdget(const struct file *file, struct fd *real) static int ovl_open(struct inode *inode, struct file *file) { + struct dentry *dentry = file_dentry(file); struct file *realfile; + struct path realpath; int err; - err = ovl_maybe_copy_up(file_dentry(file), file->f_flags); + err = ovl_maybe_copy_up(dentry, file->f_flags); if (err) return err; /* No longer need these flags, so don't pass them on to underlying fs */ file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - realfile = ovl_open_realfile(file, ovl_inode_realdata(inode)); + ovl_path_realdata(dentry, &realpath); + realfile = ovl_open_realfile(file, &realpath); if (IS_ERR(realfile)) return PTR_ERR(realfile); @@ -270,7 +276,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, SB_FREEZE_WRITE); file_end_write(iocb->ki_filp); - 
ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } orig_iocb->ki_pos = iocb->ki_pos; @@ -352,7 +358,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); ret = file_remove_privs(file); if (ret) goto out_unlock; @@ -376,7 +382,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) ovl_iocb_to_rwf(ifl)); file_end_write(real.file); /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } else { struct ovl_aio_req *aio_req; @@ -426,12 +432,11 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, struct fd real; const struct cred *old_cred; struct inode *inode = file_inode(out); - struct inode *realinode = ovl_inode_real(inode); ssize_t ret; inode_lock(inode); /* Update mode */ - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); ret = file_remove_privs(out); if (ret) goto out_unlock; @@ -447,7 +452,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, file_end_write(real.file); /* Update size */ - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); revert_creds(old_cred); fdput(real); @@ -521,7 +526,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len revert_creds(old_cred); /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); fdput(real); @@ -593,7 +598,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, revert_creds(old_cred); /* Update size */ - ovl_copyattr(ovl_inode_real(inode_out), inode_out); + ovl_copyattr(inode_out); fdput(real_in); fdput(real_out); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 1f36158c7dbe..492eddeb481f 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -21,6 +21,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { int 
err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool full_copy_up = false; struct dentry *upperdentry; const struct cred *old_cred; @@ -77,10 +78,10 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); - err = notify_change(&init_user_ns, upperdentry, attr, NULL); + err = ovl_do_notify_change(ofs, upperdentry, attr); revert_creds(old_cred); if (!err) - ovl_copyattr(upperdentry->d_inode, dentry->d_inode); + ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); if (winode) @@ -279,12 +280,14 @@ int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); - struct inode *realinode = upperinode ?: ovl_inode_lower(inode); + struct inode *realinode; + struct path realpath; const struct cred *old_cred; int err; /* Careful in RCU walk mode */ - if (!realinode) { + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { WARN_ON(!(mask & MAY_NOT_BLOCK)); return -ECHILD; } @@ -297,6 +300,7 @@ int ovl_permission(struct user_namespace *mnt_userns, if (err) return err; + realinode = d_inode(realpath.dentry); old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { @@ -304,7 +308,7 @@ int ovl_permission(struct user_namespace *mnt_userns, /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(&init_user_ns, realinode, mask); + err = inode_permission(mnt_user_ns(realpath.mnt), realinode, mask); revert_creds(old_cred); return err; @@ -342,8 +346,10 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + struct path realpath; const 
struct cred *old_cred; err = ovl_want_write(dentry); @@ -351,8 +357,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, goto out; if (!value && !upperdentry) { + ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); + err = vfs_getxattr(mnt_user_ns(realpath.mnt), realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -367,17 +374,17 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, } old_cred = ovl_override_creds(dentry->d_sb); - if (value) - err = vfs_setxattr(&init_user_ns, realdentry, name, value, size, - flags); - else { + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, + flags); + } else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(&init_user_ns, realdentry, name); + err = ovl_do_removexattr(ofs, realdentry, name); } revert_creds(old_cred); /* copy c/mtime */ - ovl_copyattr(d_inode(realdentry), inode); + ovl_copyattr(inode); out_drop_write: ovl_drop_write(dentry); @@ -390,11 +397,11 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, { ssize_t res; const struct cred *old_cred; - struct dentry *realdentry = - ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); + struct path realpath; + ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); + res = vfs_getxattr(mnt_user_ns(realpath.mnt), realpath.dentry, name, value, size); revert_creds(old_cred); return res; } @@ -535,7 +542,7 @@ int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa) if (err) return err; - return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa); + return vfs_fileattr_set(mnt_user_ns(realpath->mnt), realpath->dentry, fa); } int ovl_fileattr_set(struct user_namespace *mnt_userns, @@ -579,7 +586,7 @@ int ovl_fileattr_set(struct 
user_namespace *mnt_userns, inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); /* Update ctime */ - ovl_copyattr(ovl_inode_real(inode), inode); + ovl_copyattr(inode); } ovl_drop_write(dentry); out: @@ -777,16 +784,19 @@ void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid) { struct inode *realinode; + struct ovl_inode *oi = OVL_I(inode); if (oip->upperdentry) - OVL_I(inode)->__upperdentry = oip->upperdentry; - if (oip->lowerpath && oip->lowerpath->dentry) - OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry)); + oi->__upperdentry = oip->upperdentry; + if (oip->lowerpath && oip->lowerpath->dentry) { + oi->lowerpath.dentry = dget(oip->lowerpath->dentry); + oi->lowerpath.layer = oip->lowerpath->layer; + } if (oip->lowerdata) - OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata)); + oi->lowerdata = igrab(d_inode(oip->lowerdata)); realinode = ovl_inode_real(inode); - ovl_copyattr(realinode, inode); + ovl_copyattr(inode); ovl_copyflags(realinode, inode); ovl_map_ino(inode, ino, fsid); } @@ -871,8 +881,8 @@ static int ovl_set_nlink_common(struct dentry *dentry, if (WARN_ON(len >= sizeof(buf))) return -EIO; - return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), - OVL_XATTR_NLINK, buf, len); + return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), + OVL_XATTR_NLINK, buf, len); } int ovl_set_nlink_upper(struct dentry *dentry) @@ -897,8 +907,8 @@ unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) return fallback; - err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK, - &buf, sizeof(buf) - 1); + err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, + &buf, sizeof(buf) - 1); if (err < 0) goto fail; @@ -1102,6 +1112,10 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode = upperdentry ? 
d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; + struct path realpath = { + .dentry = upperdentry ?: lowerdentry, + .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, + }; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); int fsid = bylower ? lowerpath->layer->fsid : 0; @@ -1175,7 +1189,7 @@ struct inode *ovl_get_inode(struct super_block *sb, /* Check for non-merge dir that may have whiteouts */ if (is_dir) { if (((upperdentry && lowerdentry) || oip->numlower > 1) || - ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) { + ovl_path_check_origin_xattr(ofs, &realpath)) { ovl_set_flag(OVL_WHITEOUTS, inode); } } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 1a9b515fc45d..65c4346a5b43 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -16,6 +16,7 @@ struct ovl_lookup_data { struct super_block *sb; + struct vfsmount *mnt; struct qstr name; bool is_dir; bool opaque; @@ -25,14 +26,14 @@ struct ovl_lookup_data { bool metacopy; }; -static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, +static int ovl_check_redirect(struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); - buf = ovl_get_redirect_xattr(ofs, dentry, prelen + strlen(post)); + buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); @@ -105,13 +106,13 @@ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) return 0; } -static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry, +static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; - res = ovl_do_getxattr(ofs, dentry, ox, NULL, 0); + res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; @@ 
-125,7 +126,7 @@ static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *dentry, if (!fh) return ERR_PTR(-ENOMEM); - res = ovl_do_getxattr(ofs, dentry, ox, fh->buf, res); + res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); if (res < 0) goto fail; @@ -193,16 +194,17 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, return real; } -static bool ovl_is_opaquedir(struct super_block *sb, struct dentry *dentry) +static bool ovl_is_opaquedir(struct ovl_fs *ofs, struct path *path) { - return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_OPAQUE); + return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE); } -static struct dentry *ovl_lookup_positive_unlocked(const char *name, +static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, + const char *name, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_len_unlocked(name, base, len); + struct dentry *ret = lookup_one_unlocked(mnt_user_ns(d->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -224,10 +226,11 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { struct dentry *this; + struct path path; int err; bool last_element = !post[0]; - this = ovl_lookup_positive_unlocked(name, base, namelen, drop_negative); + this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -253,12 +256,15 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, d->stop = true; goto put_and_out; } + + path.dentry = this; + path.mnt = d->mnt; if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), this); + err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path); if (err < 0) goto out_err; @@ 
-278,14 +284,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, if (d->last) goto out; - if (ovl_is_opaquedir(d->sb, this)) { + if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) { d->stop = true; if (last_element) d->opaque = true; goto out; } } - err = ovl_check_redirect(this, d, prelen, post); + err = ovl_check_redirect(&path, d, prelen, post); if (err) goto out_err; out: @@ -464,7 +470,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); if (err) goto fail; @@ -704,7 +710,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_one_positive_unlocked(ovl_upper_mnt_userns(ofs), name.name, + ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -856,6 +863,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { + d.mnt = ovl_upper_mnt(ofs); err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; @@ -911,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, else d.last = lower.layer->idx == roe->numlower; + d.mnt = lower.layer->mnt; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; @@ -1071,14 +1080,18 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry) ovl_dentry_set_upper_alias(dentry); else if (index) { - upperdentry = dget(index); - upperredirect = ovl_get_redirect_xattr(ofs, upperdentry, 0); + struct path upperpath = { + .dentry = upperdentry = dget(index), + .mnt = ovl_upper_mnt(ofs), + }; + + upperredirect = 
ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); upperredirect = NULL; goto out_free_oe; } - err = ovl_check_metacopy_xattr(ofs, upperdentry); + err = ovl_check_metacopy_xattr(ofs, &upperpath); if (err < 0) goto out_free_oe; uppermetacopy = err; @@ -1163,8 +1176,8 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_positive_unlocked(name->name, lowerdir, - name->len); + this = lookup_one_positive_unlocked(mnt_user_ns(poe->lowerstack[i].layer->mnt), + name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 2cd5741c873b..4f34b7e02eee 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> +#include <linux/namei.h> #include "ovl_entry.h" #undef pr_fmt @@ -122,109 +123,180 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) return ovl_xattr_table[ox][ofs->config.userxattr]; } -static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +/* + * When changing ownership of an upper object map the intended ownership + * according to the upper layer's idmapping. When an upper mount idmaps files + * that are stored on-disk as owned by id 1001 to id 1000 this means stat on + * this object will report it as being owned by id 1000 when calling stat via + * the upper mount. + * In order to change ownership of an object so stat reports id 1000 when + * called on an idmapped upper mount the value written to disk - i.e., the + * value stored in ia_*id - must 1001. The mount mapping helper will thus take + * care to map 1000 to 1001. + * The mnt idmapping helpers are nops if the upper layer isn't idmapped. 
+ */ +static inline int ovl_do_notify_change(struct ovl_fs *ofs, + struct dentry *upperdentry, + struct iattr *attr) +{ + struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); + struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); + + if (attr->ia_valid & ATTR_UID) + attr->ia_uid = mapped_kuid_user(upper_mnt_userns, + fs_userns, attr->ia_uid); + if (attr->ia_valid & ATTR_GID) + attr->ia_gid = mapped_kgid_user(upper_mnt_userns, + fs_userns, attr->ia_gid); + + return notify_change(upper_mnt_userns, upperdentry, attr, NULL); +} + +static inline int ovl_do_rmdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(&init_user_ns, dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_userns(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } -static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, + struct dentry *dentry) { - int err = vfs_unlink(&init_user_ns, dir, dentry, NULL); + int err = vfs_unlink(ovl_upper_mnt_userns(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } -static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, + struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL); + int err = vfs_link(old_dentry, ovl_upper_mnt_userns(ofs), dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } -static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_create(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(&init_user_ns, dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_userns(ofs), dir, dentry, mode, true); 
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } -static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(&init_user_ns, dir, dentry, mode); + int err = vfs_mkdir(ovl_upper_mnt_userns(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } -static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_mknod(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_userns(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } -static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, +static inline int ovl_do_symlink(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(&init_user_ns, dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_userns(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } -static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, void *value, - size_t size) +static inline ssize_t ovl_do_getxattr(struct path *path, const char *name, + void *value, size_t size) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_getxattr(&init_user_ns, dentry, name, value, size); - int len = (value && err > 0) ? err : 0; + int err, len; + + WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); + + err = vfs_getxattr(mnt_user_ns(path->mnt), path->dentry, + name, value, size); + len = (value && err > 0) ? 
err : 0; pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", - dentry, name, min(len, 48), value, size, err); + path->dentry, name, min(len, 48), value, size, err); return err; } +static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, + struct dentry *upperdentry, + enum ovl_xattr ox, void *value, + size_t size) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size); +} + +static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, + struct path *path, + enum ovl_xattr ox, void *value, + size_t size) +{ + return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size); +} + static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, const void *value, - size_t size) + const char *name, const void *value, + size_t size, int flags) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0); - pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", - dentry, name, min((int)size, 48), value, size, err); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + + pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", + dentry, name, min((int)size, 48), value, size, flags, err); return err; } +static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const void *value, + size_t size) +{ + return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0); +} + static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox) + const char *name) { - const char *name = ovl_xattr(ofs, ox); - int err = vfs_removexattr(&init_user_ns, dentry, name); + int err = vfs_removexattr(ovl_upper_mnt_userns(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } -static inline int ovl_do_rename(struct inode *olddir, 
struct dentry *olddentry, - struct inode *newdir, struct dentry *newdentry, - unsigned int flags) +static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox) +{ + return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); +} + +static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, + struct dentry *olddentry, struct inode *newdir, + struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { - .old_mnt_userns = &init_user_ns, + .old_mnt_userns = ovl_upper_mnt_userns(ofs), .old_dir = olddir, .old_dentry = olddentry, - .new_mnt_userns = &init_user_ns, + .new_mnt_userns = ovl_upper_mnt_userns(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, @@ -239,22 +311,31 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, return err; } -static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +static inline int ovl_do_whiteout(struct ovl_fs *ofs, + struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(&init_user_ns, dir, dentry); + int err = vfs_whiteout(ovl_upper_mnt_userns(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } -static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) +static inline struct dentry *ovl_do_tmpfile(struct ovl_fs *ofs, + struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0); + struct dentry *ret = vfs_tmpfile(ovl_upper_mnt_userns(ofs), dentry, mode, 0); int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; } +static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, + const char *name, + struct dentry *base, int len) +{ + return lookup_one(ovl_upper_mnt_userns(ofs), name, base, len); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) @@ -293,10 +374,13 @@ enum ovl_path_type ovl_path_type(struct dentry 
*dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); void ovl_path_lowerdata(struct dentry *dentry, struct path *path); +void ovl_i_path_real(struct inode *inode, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); +const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); @@ -330,9 +414,20 @@ struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); -bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry); -bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, - enum ovl_xattr ox); +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, + enum ovl_xattr ox); +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path); + +static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, + struct dentry *upperdentry) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + return ovl_path_check_origin_xattr(ofs, &upperpath); +} + int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); @@ -344,10 +439,9 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, 
struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path); bool ovl_is_metacopy_dentry(struct dentry *dentry); -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, - int padding); +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -366,9 +460,15 @@ static inline bool ovl_test_flag(unsigned long flag, struct inode *inode) } static inline bool ovl_is_impuredir(struct super_block *sb, - struct dentry *dentry) + struct dentry *upperdentry) { - return ovl_check_dir_xattr(sb, dentry, OVL_XATTR_IMPURE); + struct ovl_fs *ofs = OVL_FS(sb); + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + + return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); } /* @@ -461,12 +561,13 @@ static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, extern const struct file_operations ovl_dir_operations; struct file *ovl_dir_real_file(const struct file *file, bool want_upper); int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); -void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list); void ovl_cache_free(struct list_head *list); void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); -int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, - struct dentry *dentry, int level); +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct ovl_fs *ofs); /* @@ -520,16 +621,7 @@ bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir); struct inode 
*ovl_get_trap_inode(struct super_block *sb, struct dentry *dir); struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip); -static inline void ovl_copyattr(struct inode *from, struct inode *to) -{ - to->i_uid = from->i_uid; - to->i_gid = from->i_gid; - to->i_mode = from->i_mode; - to->i_atime = from->i_atime; - to->i_mtime = from->i_mtime; - to->i_ctime = from->i_ctime; - i_size_write(to, i_size_read(from)); -} +void ovl_copyattr(struct inode *to); /* vfs inode flags copied from real to ovl inode */ #define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) @@ -570,12 +662,15 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) -int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, umode_t mode); -struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, +int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, + struct dentry **newdentry, umode_t mode); +struct dentry *ovl_create_real(struct ovl_fs *ofs, + struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); +int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); -int ovl_cleanup(struct inode *dir, struct dentry *dentry); -struct dentry *ovl_lookup_temp(struct dentry *workdir); -struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; @@ -591,9 +686,8 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_maybe_copy_up(struct dentry *dentry, int flags); -int ovl_copy_xattr(struct super_block *sb, struct dentry *old, - struct dentry *new); -int ovl_set_attr(struct dentry *upper, struct kstat *stat); +int 
ovl_copy_xattr(struct super_block *sb, struct path *path, struct dentry *new); +int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 63efee554f69..e1af8f660698 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -90,6 +90,11 @@ static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) return ofs->layers[0].mnt; } +static inline struct user_namespace *ovl_upper_mnt_userns(struct ovl_fs *ofs) +{ + return mnt_user_ns(ovl_upper_mnt(ofs)); +} + static inline struct ovl_fs *OVL_FS(struct super_block *sb) { return (struct ovl_fs *)sb->s_fs_info; @@ -129,7 +134,7 @@ struct ovl_inode { unsigned long flags; struct inode vfs_inode; struct dentry *__upperdentry; - struct inode *lower; + struct ovl_path lowerpath; /* synchronize copy up and more */ struct mutex lock; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 150fdf3bc68d..78f62cc1797b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -264,11 +264,11 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } -static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +static int ovl_check_whiteouts(struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; - struct dentry *dentry; + struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); @@ -278,7 +278,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one_len(p->name, dir, p->len); + dentry = 
lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -312,7 +312,7 @@ static inline int ovl_dir_read(struct path *realpath, } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) - err = ovl_check_whiteouts(realpath->dentry, rdd); + err = ovl_check_whiteouts(realpath, rdd); fput(realfile); @@ -479,7 +479,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) goto get; } } - this = lookup_one_len(p->name, dir, p->len); + this = lookup_one(mnt_user_ns(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; @@ -623,8 +623,8 @@ static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) * Removing the "impure" xattr is best effort. */ if (!ovl_want_write(dentry)) { - ovl_do_removexattr(ofs, ovl_dentry_upper(dentry), - OVL_XATTR_IMPURE); + ovl_removexattr(ofs, ovl_dentry_upper(dentry), + OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); @@ -1001,7 +1001,8 @@ del_entry: return err; } -void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, + struct list_head *list) { struct ovl_cache_entry *p; @@ -1012,7 +1013,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; - dentry = lookup_one_len(p->name, upper, p->len); + dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, @@ -1020,7 +1021,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) continue; } if (dentry->d_inode) - ovl_cleanup(upper->d_inode, dentry); + ovl_cleanup(ofs, upper->d_inode, dentry); dput(dentry); } inode_unlock(upper->d_inode); @@ -1064,7 +1065,8 @@ int 
ovl_check_d_type_supported(struct path *realpath) #define OVL_INCOMPATDIR_NAME "incompat" -static int ovl_workdir_cleanup_recurse(struct path *path, int level) +static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, struct path *path, + int level) { int err; struct inode *dir = path->dentry->d_inode; @@ -1111,11 +1113,11 @@ static int ovl_workdir_cleanup_recurse(struct path *path, int level) err = -EINVAL; break; } - dentry = lookup_one_len(p->name, path->dentry, p->len); + dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) - err = ovl_workdir_cleanup(dir, path->mnt, dentry, level); + err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); dput(dentry); if (err) break; @@ -1126,24 +1128,24 @@ out: return err; } -int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, - struct dentry *dentry, int level) +int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, + struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) { - return ovl_cleanup(dir, dentry); + return ovl_cleanup(ofs, dir, dentry); } - err = ovl_do_rmdir(dir, dentry); + err = ovl_do_rmdir(ofs, dir, dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; inode_unlock(dir); - err = ovl_workdir_cleanup_recurse(&path, level + 1); + err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); if (!err) - err = ovl_cleanup(dir, dentry); + err = ovl_cleanup(ofs, dir, dentry); } return err; @@ -1179,7 +1181,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) if (p->len == 2 && p->name[1] == '.') continue; } - index = lookup_one_len(p->name, indexdir, p->len); + index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; @@ -1187,7 +1189,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == 
'#') { - err = ovl_workdir_cleanup(dir, path.mnt, index, 1); + err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); if (err) break; goto next; @@ -1197,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if @@ -1213,7 +1215,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) err = ovl_cleanup_and_whiteout(ofs, dir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } if (err) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 001cdbb8f015..e0a2e0468ee7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -184,7 +184,8 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) oi->version = 0; oi->flags = 0; oi->__upperdentry = NULL; - oi->lower = NULL; + oi->lowerpath.dentry = NULL; + oi->lowerpath.layer = NULL; oi->lowerdata = NULL; mutex_init(&oi->lock); @@ -205,7 +206,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - iput(oi->lower); + dput(oi->lowerpath.dentry); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -761,7 +762,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, inode_lock_nested(dir, I_MUTEX_PARENT); retry: - work = lookup_one_len(name, ofs->workbasedir, strlen(name)); + work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -778,7 +779,7 @@ retry: goto out_unlock; retried = true; - err = ovl_workdir_cleanup(dir, mnt, work, 0); + err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0); dput(work); if (err == -EINVAL) { work = ERR_PTR(err); @@ -787,7 +788,7 @@ retry: goto retry; } - err = ovl_mkdir_real(dir, &work, attr.ia_mode); + err = ovl_mkdir_real(ofs, dir, &work, 
attr.ia_mode); if (err) goto out_dput; @@ -809,19 +810,19 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = vfs_removexattr(&init_user_ns, work, - XATTR_NAME_POSIX_ACL_DEFAULT); + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = vfs_removexattr(&init_user_ns, work, - XATTR_NAME_POSIX_ACL_ACCESS); + err = ovl_do_removexattr(ofs, work, + XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ inode_lock(work->d_inode); - err = notify_change(&init_user_ns, work, &attr, NULL); + err = ovl_do_notify_change(ofs, work, &attr); inode_unlock(work->d_inode); if (err) goto out_dput; @@ -873,10 +874,6 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } - if (is_idmapped_mnt(path->mnt)) { - pr_err("idmapped layers are currently not supported\n"); - goto out_put; - } if (!d_is_dir(path->dentry)) { pr_err("'%s' not a directory\n", name); goto out_put; @@ -1256,8 +1253,9 @@ out: * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and * negative values if error is encountered. 
*/ -static int ovl_check_rename_whiteout(struct dentry *workdir) +static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { + struct dentry *workdir = ofs->workdir; struct inode *dir = d_inode(workdir); struct dentry *temp; struct dentry *dest; @@ -1267,12 +1265,12 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) inode_lock_nested(dir, I_MUTEX_PARENT); - temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0)); + temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); err = PTR_ERR(temp); if (IS_ERR(temp)) goto out_unlock; - dest = ovl_lookup_temp(workdir); + dest = ovl_lookup_temp(ofs, workdir); err = PTR_ERR(dest); if (IS_ERR(dest)) { dput(temp); @@ -1281,14 +1279,14 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT); + err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT); if (err) { if (err == -EINVAL) err = 0; goto cleanup_temp; } - whiteout = lookup_one_len(name.name.name, workdir, name.name.len); + whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len); err = PTR_ERR(whiteout); if (IS_ERR(whiteout)) goto cleanup_temp; @@ -1297,11 +1295,11 @@ static int ovl_check_rename_whiteout(struct dentry *workdir) /* Best effort cleanup of whiteout and temp file */ if (err) - ovl_cleanup(dir, whiteout); + ovl_cleanup(ofs, dir, whiteout); dput(whiteout); cleanup_temp: - ovl_cleanup(dir, temp); + ovl_cleanup(ofs, dir, temp); release_dentry_name_snapshot(&name); dput(temp); dput(dest); @@ -1312,16 +1310,17 @@ out_unlock: return err; } -static struct dentry *ovl_lookup_or_create(struct dentry *parent, +static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, + struct dentry *parent, const char *name, umode_t mode) { size_t len = strlen(name); struct dentry *child; inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - child = 
lookup_one_len(name, parent, len); + child = ovl_lookup_upper(ofs, name, parent, len); if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(parent->d_inode, child, + child = ovl_create_real(ofs, parent->d_inode, child, OVL_CATTR(mode)); inode_unlock(parent->d_inode); dput(parent); @@ -1343,7 +1342,7 @@ static int ovl_create_volatile_dirty(struct ovl_fs *ofs) const char *const *name = volatile_path; for (ctr = ARRAY_SIZE(volatile_path); ctr; ctr--, name++) { - d = ovl_lookup_or_create(d, *name, ctr > 1 ? S_IFDIR : S_IFREG); + d = ovl_lookup_or_create(ofs, d, *name, ctr > 1 ? S_IFDIR : S_IFREG); if (IS_ERR(d)) return PTR_ERR(d); } @@ -1391,7 +1390,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + temp = ovl_do_tmpfile(ofs, ofs->workdir, S_IFREG | 0); ofs->tmpfile = !IS_ERR(temp); if (ofs->tmpfile) dput(temp); @@ -1400,7 +1399,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, /* Check if upper/work fs supports RENAME_WHITEOUT */ - err = ovl_check_rename_whiteout(ofs->workdir); + err = ovl_check_rename_whiteout(ofs); if (err < 0) goto out; @@ -1411,7 +1410,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, /* * Check if upper/work fs supports (trusted|user).overlay.* xattr */ - err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); + err = ovl_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1); if (err) { ofs->noxattr = true; if (ofs->config.index || ofs->config.metacopy) { @@ -1429,7 +1428,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, } err = 0; } else { - ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); + ovl_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE); } /* diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f48284a2a896..87f811c089e4 100644 --- 
a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -194,6 +194,20 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) return type; } +enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + WARN_ON_ONCE(d_is_dir(dentry)); + + if (!OVL_TYPE_UPPER(type) || OVL_TYPE_MERGE(type)) + ovl_path_lowerdata(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + struct dentry *ovl_dentry_upper(struct dentry *dentry) { return ovl_upperdentry_dereference(OVL_I(d_inode(dentry))); @@ -236,6 +250,17 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode) return ovl_upperdentry_dereference(OVL_I(inode)); } +void ovl_i_path_real(struct inode *inode, struct path *path) +{ + path->dentry = ovl_i_dentry_upper(inode); + if (!path->dentry) { + path->dentry = OVL_I(inode)->lowerpath.dentry; + path->mnt = OVL_I(inode)->lowerpath.layer->mnt; + } else { + path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb)); + } +} + struct inode *ovl_inode_upper(struct inode *inode) { struct dentry *upperdentry = ovl_i_dentry_upper(inode); @@ -245,7 +270,9 @@ struct inode *ovl_inode_upper(struct inode *inode) struct inode *ovl_inode_lower(struct inode *inode) { - return OVL_I(inode)->lower; + struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry; + + return lowerdentry ? 
d_inode(lowerdentry) : NULL; } struct inode *ovl_inode_real(struct inode *inode) @@ -443,7 +470,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity) void ovl_dir_modified(struct dentry *dentry, bool impurity) { /* Copy mtime/ctime */ - ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry)); + ovl_copyattr(d_inode(dentry)); ovl_dir_version_inc(dentry, impurity); } @@ -466,6 +493,7 @@ bool ovl_is_whiteout(struct dentry *dentry) struct file *ovl_path_open(struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); + struct user_namespace *real_mnt_userns = mnt_user_ns(path->mnt); int err, acc_mode; if (flags & ~(O_ACCMODE | O_LARGEFILE)) @@ -482,12 +510,12 @@ struct file *ovl_path_open(struct path *path, int flags) BUG(); } - err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN); + err = inode_permission(real_mnt_userns, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(&init_user_ns, inode)) + if (inode_owner_or_capable(real_mnt_userns, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); @@ -550,11 +578,11 @@ void ovl_copy_up_end(struct dentry *dentry) ovl_inode_unlock(d_inode(dentry)); } -bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry) +bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, struct path *path) { int res; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_ORIGIN, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0); /* Zero size value means "copied up but origin unknown" */ if (res >= 0) @@ -563,16 +591,16 @@ bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry) return false; } -bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, - enum ovl_xattr ox) +bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, struct path *path, + enum ovl_xattr ox) { int res; char val; - if (!d_is_dir(dentry)) + if 
(!d_is_dir(path->dentry)) return false; - res = ovl_do_getxattr(OVL_FS(sb), dentry, ox, &val, 1); + res = ovl_path_getxattr(ofs, path, ox, &val, 1); if (res == 1 && val == 'y') return true; @@ -612,7 +640,7 @@ int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, if (ofs->noxattr) return xerr; - err = ovl_do_setxattr(ofs, upperdentry, ox, value, size); + err = ovl_setxattr(ofs, upperdentry, ox, value, size); if (err == -EOPNOTSUPP) { pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox)); @@ -652,8 +680,8 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper) char buf[OVL_PROTATTR_MAX+1]; int res, n; - res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, - OVL_PROTATTR_MAX); + res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf, + OVL_PROTATTR_MAX); if (res < 0) return; @@ -708,7 +736,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, len, -EPERM); } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { - err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR); + err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR); if (err == -EOPNOTSUPP || err == -ENODATA) err = 0; } @@ -824,7 +852,7 @@ static void ovl_cleanup_index(struct dentry *dentry) } inode_lock_nested(dir, I_MUTEX_PARENT); - index = lookup_one_len(name.name, indexdir, name.len); + index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; @@ -834,7 +862,7 @@ static void ovl_cleanup_index(struct dentry *dentry) dir, index); } else { /* Cleanup orphan index entries */ - err = ovl_cleanup(dir, index); + err = ovl_cleanup(ofs, dir, index); } inode_unlock(dir); @@ -943,15 +971,15 @@ err: } /* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry) +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct path *path) { int res; /* Only regular 
files can have metacopy xattr */ - if (!S_ISREG(d_inode(dentry)->i_mode)) + if (!S_ISREG(d_inode(path->dentry)->i_mode)) return 0; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_METACOPY, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; @@ -987,13 +1015,12 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, - int padding) +char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct path *path, int padding) { int res; char *s, *next, *buf = NULL; - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; if (res < 0) @@ -1005,7 +1032,7 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, if (!buf) return ERR_PTR(-ENOMEM); - res = ovl_do_getxattr(ofs, dentry, OVL_XATTR_REDIRECT, buf, res); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res); if (res < 0) goto fail; if (res == 0) @@ -1060,3 +1087,33 @@ int ovl_sync_status(struct ovl_fs *ofs) return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); } + +/* + * ovl_copyattr() - copy inode attributes from layer to ovl inode + * + * When overlay copies inode information from an upper or lower layer to the + * relevant overlay inode it will apply the idmapping of the upper or lower + * layer when doing so ensuring that the ovl inode ownership will correctly + * reflect the ownership of the idmapped upper or lower layer. For example, an + * idmapped upper or lower layer mapping id 1001 to id 1000 will take care to + * map any lower or upper inode owned by id 1001 to id 1000. These mapping + * helpers are nops when the relevant layer isn't idmapped. 
+ */ +void ovl_copyattr(struct inode *inode) +{ + struct path realpath; + struct inode *realinode; + struct user_namespace *real_mnt_userns; + + ovl_i_path_real(inode, &realpath); + realinode = d_inode(realpath.dentry); + real_mnt_userns = mnt_user_ns(realpath.mnt); + + inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode); + inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode); + inode->i_mode = realinode->i_mode; + inode->i_atime = realinode->i_atime; + inode->i_mtime = realinode->i_mtime; + inode->i_ctime = realinode->i_ctime; + i_size_write(inode, i_size_read(realinode)); +} diff --git a/fs/read_write.c b/fs/read_write.c index e643aec2b0ef..b1b1cdfee9d3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -682,6 +682,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, return ksys_pread64(fd, buf, count, pos); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) +COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, + size_t, count, compat_arg_u64_dual(pos)) +{ + return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); +} +#endif + ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { @@ -708,6 +716,14 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ksys_pwrite64(fd, buf, count, pos); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) +COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, compat_arg_u64_dual(pos)) +{ + return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); +} +#endif + static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/stat.c b/fs/stat.c index 5c2c94464e8b..9ced8860e0f3 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -659,7 +659,7 @@ SYSCALL_DEFINE5(statx, return ret; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int 
cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; diff --git a/fs/sync.c b/fs/sync.c index c7690016453e..dc725914e1ed 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -373,6 +373,15 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, return ksys_sync_file_range(fd, offset, nbytes, flags); } +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE) +COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset), + compat_arg_u64_dual(nbytes), unsigned int, flags) +{ + return ksys_sync_file_range(fd, compat_arg_u64_glue(offset), + compat_arg_u64_glue(nbytes), flags); +} +#endif + /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index c0b84e960b20..e8b9b756f0ac 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -65,7 +65,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) */ static int run_gc(struct ubifs_info *c) { - int err, lnum; + int lnum; /* Make some free space by garbage-collecting dirty space */ down_read(&c->commit_sem); @@ -76,10 +76,7 @@ static int run_gc(struct ubifs_info *c) /* GC freed one LEB, return it to lprops */ dbg_budg("GC freed LEB %d", lnum); - err = ubifs_return_leb(c, lnum); - if (err) - return err; - return 0; + return ubifs_return_leb(c, lnum); } /** diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e4f193eae4b2..e4c4761aff7f 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -677,7 +677,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, int err; err = security_inode_init_security(inode, dentry, qstr, - &init_xattrs, 0); + &init_xattrs, NULL); if (err) { struct ubifs_info *c = dentry->i_sb->s_fs_info; ubifs_err(c, "cannot initialize security for inode %lu, error %d", diff --git a/fs/xfs/libxfs/xfs_ag.c 
b/fs/xfs/libxfs/xfs_ag.c index 1e4ee042d52f..3e920cf1b454 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -173,7 +173,6 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -192,7 +191,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 14ae0826bc15..836ab1b8ed7b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -25,10 +25,9 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_attr_item.h" -#include "xfs_log.h" +#include "xfs_xattr.h" -struct kmem_cache *xfs_attri_cache; -struct kmem_cache *xfs_attrd_cache; +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -58,11 +57,11 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -377,7 +376,7 @@ xfs_attr_try_sf_addname( static 
int xfs_attr_sf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -423,7 +422,7 @@ out: */ static enum xfs_delattr_state xfs_attr_complete_op( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; @@ -439,7 +438,7 @@ xfs_attr_complete_op( static int xfs_attr_leaf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -493,7 +492,7 @@ out: */ static int xfs_attr_node_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -530,7 +529,7 @@ out: static int xfs_attr_rmtval_alloc( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -594,6 +593,19 @@ xfs_attr_leaf_mark_incomplete( return xfs_attr3_leaf_setflag(args); } +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); +} + /* * Initial setup for xfs_attr_node_removename. Make sure the attr is there and * the blocks are valid. 
Attr keys with remote blocks will be marked @@ -601,29 +613,33 @@ xfs_attr_leaf_mark_incomplete( */ static int xfs_attr_node_removename_setup( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state **state = &attr->xattri_da_state; + struct xfs_da_state *state; int error; - error = xfs_attr_node_hasname(args, state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); if (error != -EEXIST) goto out; error = 0; - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_mark_incomplete(args, *state); + error = xfs_attr_leaf_mark_incomplete(args, state); if (error) goto out; if (args->rmtblkno > 0) error = xfs_attr_rmtval_invalidate(args); out: - if (error) - xfs_da_state_free(*state); + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -635,7 +651,7 @@ out: */ static int xfs_attr_leaf_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -700,7 +716,7 @@ xfs_attr_leaf_shrink( */ int xfs_attr_set_iter( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -852,6 +868,7 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) @@ -869,19 +886,22 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return 
error; } static int -xfs_attr_item_init( +xfs_attr_intent_init( struct xfs_da_args *args, unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_item **attr) /* new xfs_attr_item */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; - new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); new->xattri_op_flags = op_flags; new->xattri_da_args = args; @@ -894,10 +914,10 @@ static int xfs_attr_defer_add( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); if (error) return error; @@ -913,10 +933,10 @@ static int xfs_attr_defer_replace( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); if (error) return error; @@ -933,10 +953,10 @@ xfs_attr_defer_remove( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); if (error) return error; @@ -962,7 +982,6 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; - int delayed = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -1007,12 +1026,6 @@ xfs_attr_set( rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } - if (delayed) { - error = xfs_attr_use_log_assist(mp); - if (error) - return error; - } - /* * Root fork attributes can use reserved data blocks for this * operation if necessary @@ -1020,7 +1033,7 @@ xfs_attr_set( xfs_init_attr_trans(args, &tres, &total); error = 
xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) - goto drop_incompat; + return error; if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, @@ -1080,9 +1093,6 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); -drop_incompat: - if (delayed) - xlog_drop_incompat_feat(mp->m_log); return error; out_trans_cancel: @@ -1091,40 +1101,6 @@ out_trans_cancel: goto out_unlock; } -int __init -xfs_attri_init_cache(void) -{ - xfs_attri_cache = kmem_cache_create("xfs_attri", - sizeof(struct xfs_attri_log_item), - 0, 0, NULL); - - return xfs_attri_cache != NULL ? 0 : -ENOMEM; -} - -void -xfs_attri_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attri_cache); - xfs_attri_cache = NULL; -} - -int __init -xfs_attrd_init_cache(void) -{ - xfs_attrd_cache = kmem_cache_create("xfs_attrd", - sizeof(struct xfs_attrd_log_item), - 0, 0, NULL); - - return xfs_attrd_cache != NULL ? 0 : -ENOMEM; -} - -void -xfs_attrd_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attrd_cache); - xfs_attrd_cache = NULL; -} - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -1384,32 +1360,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) - *statep = state; - /* * Search to see if name exists, and get back a pointer to it. 
*/ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1420,7 +1384,7 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -1429,7 +1393,8 @@ xfs_attr_node_addname_find_attr( * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); switch (error) { case -ENOATTR: if (args->op_flags & XFS_DA_OP_REPLACE) @@ -1456,8 +1421,10 @@ xfs_attr_node_addname_find_attr( return 0; error: - if (attr->xattri_da_state) + if (attr->xattri_da_state) { xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -1470,7 +1437,7 @@ error: */ static int xfs_attr_node_try_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = attr->xattri_da_state; @@ -1511,6 +1478,7 @@ xfs_attr_node_try_addname( out: xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } @@ -1535,10 +1503,10 @@ xfs_attr_node_removename( static int xfs_attr_node_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state *state = NULL; + struct xfs_da_state *state = xfs_da_state_alloc(args); int retval = 0; int error = 0; @@ -1548,8 +1516,6 @@ xfs_attr_node_remove_attr( * attribute entry after any split ops. 
*/ args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1567,8 +1533,7 @@ xfs_attr_node_remove_attr( retval = error = 0; out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); if (error) return error; return retval; @@ -1597,7 +1562,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1616,8 +1582,7 @@ out_release: state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } @@ -1637,3 +1602,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1af7abe29eef..e329da3e7afa 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -31,7 +31,8 @@ struct xfs_attr_list_context; static inline bool xfs_has_larp(struct xfs_mount *mp) { #ifdef DEBUG - return xfs_globals.larp; + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; #else return false; #endif @@ -434,7 +435,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_attr_item.xattri_da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. 
A return code of -EAGAIN signals the @@ -501,44 +502,46 @@ enum xfs_delattr_state { { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } -/* - * Defines for xfs_attr_item.xattri_flags - */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ +struct xfs_attri_log_nameval; /* * Context used for keeping track of delayed attribute operations */ -struct xfs_attr_item { +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *xattri_da_state; + struct xfs_da_args *xattri_da_args; /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. + */ + struct xfs_attri_log_nameval *xattri_nameval; + + /* * Used by xfs_attr_set to hold a leaf buffer across a transaction roll */ struct xfs_buf *xattri_leaf_bp; - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec xattri_map; - xfs_dablk_t xattri_lblkno; - int xattri_blkcnt; - - /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *xattri_da_state; - /* Used to keep track of current state of delayed operation */ - unsigned int xattri_flags; enum xfs_delattr_state xattri_dela_state; /* - * Attr operation being performed - XFS_ATTR_OP_FLAGS_* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* */ unsigned int xattri_op_flags; - /* - * used to log this item to an intent containing a list of attrs to - * commit later - */ - struct list_head xattri_list; + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; @@ -557,21 +560,13 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args 
*args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_iter(struct xfs_attr_item *attr); -int xfs_attr_remove_iter(struct xfs_attr_item *attr); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); -extern struct kmem_cache *xfs_attri_cache; -extern struct kmem_cache *xfs_attrd_cache; - -int __init xfs_attri_init_cache(void); -void xfs_attri_destroy_cache(void); -int __init xfs_attrd_init_cache(void); -void xfs_attrd_destroy_cache(void); - /* * Check to see if the attr should be upgraded from non-existent or shortform to * single-leaf-block attribute list. @@ -634,4 +629,8 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4250159ecced..7298c148f848 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,7 +568,7 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_bmbt_irec *map = &attr->xattri_map; @@ -598,7 +598,7 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -674,7 +674,7 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct 
xfs_da_args *args = attr->xattri_da_args; int error, done; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 62b398edec3f..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_attr_item *attr); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr); -int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2aa300f7461f..2eecc49fc1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -51,16 +51,31 @@ xfs_btree_magic( return magic; } -static xfs_failaddr_t +/* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked as inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes.
+ */ +static inline xfs_failaddr_t xfs_btree_check_lblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_fsblock_t fsb, - xfs_fsblock_t sibling) + __be64 dsibling) { - if (sibling == NULLFSBLOCK) + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; + + sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; if (level >= 0) { @@ -74,17 +89,21 @@ xfs_btree_check_lblock_siblings( return NULL; } -static xfs_failaddr_t +static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_agblock_t sibling) + __be32 dsibling) { - if (sibling == NULLAGBLOCK) + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) return NULL; + + sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; if (level >= 0) { @@ -136,10 +155,10 @@ __xfs_btree_check_lblock( fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -204,10 +223,10 @@ __xfs_btree_check_sblock( } fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, - agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); + agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -426,8 +445,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. 
+ */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -3247,7 +3272,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3327,7 +3352,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3347,7 +3372,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3432,6 +3457,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -4523,10 +4550,10 @@ xfs_btree_lblock_verify( /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -4580,10 +4607,10 @@ xfs_btree_sblock_verify( agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - 
be32_to_cpu(block->bb_u.s.bb_rightsib)); + block->bb_u.s.bb_rightsib); return fa; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index aa74f3fdb571..e7201dc68f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -117,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ed2303e4d46a..d33b7686a0b3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -225,6 +225,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ceb222b4f261..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -191,35 +191,56 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static bool +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. 
+ */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; + + if (dfp->dfp_intent) + return 1; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); - return dfp->dfp_intent != NULL; + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -static bool +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; - bool ret = false; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - ret |= xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } return ret; } @@ -457,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -467,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -514,10 +539,14 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. 
*/ - bool has_intents = xfs_defer_create_intents(*tp); + int has_intents = xfs_defer_create_intents(*tp); list_splice_init(&(*tp)->t_dfops, &dop_pending); + if (has_intents < 0) { + error = has_intents; + goto out_shutdown; + } if (has_intents || dfp) { error = xfs_defer_trans_roll(tp); if (error) @@ -676,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -759,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); @@ -873,10 +908,7 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - error = xfs_attri_init_cache(); - if (error) - goto err; - error = xfs_attrd_init_cache(); + error = xfs_attr_intent_init_cache(); if (error) goto err; return 0; @@ -889,8 +921,7 @@ err: void xfs_defer_destroy_item_caches(void) { - xfs_attri_destroy_cache(); - xfs_attrd_destroy_cache(); + xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f7edd1ecf6d9..b351b9dc6561 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -906,10 +906,18 @@ struct xfs_icreate_log { * Flags for deferred attribute operations. 
* Upper bits are flags, lower byte is type code */ -#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ -#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ -#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ -#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) /* * This is the structure used to lay out an attr log item in the @@ -924,7 +932,7 @@ struct xfs_attri_log_format { uint32_t alfi_op_flags; /* marks the op as a set or remove */ uint32_t alfi_name_len; /* attr name length */ uint32_t alfi_value_len; /* attr value length */ - uint32_t alfi_attr_flags;/* attr flags */ + uint32_t alfi_attr_filter;/* attr filter flags */ }; struct xfs_attrd_log_format { diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 32e216255cb0..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -110,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. 
- */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -128,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..8b9bd178a487 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index b11870d07c56..2e8e400f10a9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, }; -/* This isn't a stable feature, warn once per day. */ -static inline void -xchk_experimental_warning( - struct xfs_mount *mp) -{ - static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( - "xchk_warning", 86400 * HZ, 1); - ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&scrub_warning)) - xfs_alert(mp, -"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); -} - static int xchk_validate_inputs( struct xfs_mount *mp, @@ -478,7 +464,8 @@ xfs_scrub_metadata( if (error) goto out; - xchk_experimental_warning(mp); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. 
Use at your own risk!"); sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); if (!sc) { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 3df9c1782ead..b744c62052b6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -17,6 +17,7 @@ #include "xfs_error.h" #include "xfs_acl.h" #include "xfs_trans.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> @@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) xfs_acl_to_disk(args.value, acl); } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); kmem_free(args.value); /* diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e8ac88d9fd14..4a28c2d77070 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -22,13 +22,15 @@ #include "xfs_attr.h" #include "xfs_attr_item.h" #include "xfs_trace.h" -#include "xfs_inode.h" #include "xfs_trans_space.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; + static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, @@ -39,12 +41,80 @@ static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_attri_log_item, attri_item); } +/* + * Shared xattr name/value buffers for logged extended attribute operations + * + * When logging updates to extended attributes, we can create quite a few + * attribute log intent items for a single xattr update. To avoid cycling the + * memory allocator and memcpy overhead, the name (and value, for setxattr) + * are kept in a refcounted object that is shared across all related log items + * and the upper-level deferred work state structure. The shared buffer has + * a control structure, followed by the name, and then the value. 
+ */ + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_get( + struct xfs_attri_log_nameval *nv) +{ + if (!refcount_inc_not_zero(&nv->refcount)) + return NULL; + return nv; +} + +static inline void +xfs_attri_log_nameval_put( + struct xfs_attri_log_nameval *nv) +{ + if (!nv) + return; + if (refcount_dec_and_test(&nv->refcount)) + kvfree(nv); +} + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_alloc( + const void *name, + unsigned int name_len, + const void *value, + unsigned int value_len) +{ + struct xfs_attri_log_nameval *nv; + + /* + * This could be over 64kB in length, so we have to use kvmalloc() for + * this. But kvmalloc() utterly sucks, so we use our own version. + */ + nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + + name_len + value_len); + if (!nv) + return nv; + + nv->name.i_addr = nv + 1; + nv->name.i_len = name_len; + nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; + memcpy(nv->name.i_addr, name, name_len); + + if (value_len) { + nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_len = value_len; + memcpy(nv->value.i_addr, value, value_len); + } else { + nv->value.i_addr = NULL; + nv->value.i_len = 0; + } + nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + + refcount_set(&nv->refcount, 1); + return nv; +} + STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kmem_free(attrip->attri_item.li_lv_shadow); - kvfree(attrip); + xfs_attri_log_nameval_put(attrip->attri_nameval); + kmem_cache_free(xfs_attri_cache, attrip); } /* @@ -73,16 +143,17 @@ xfs_attri_item_size( int *nbytes) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + - xlog_calc_iovec_len(attrip->attri_name_len); + xlog_calc_iovec_len(nv->name.i_len); - if (!attrip->attri_value_len) + if (!nv->value.i_len) return; *nvecs += 1; - *nbytes += xlog_calc_iovec_len(attrip->attri_value_len); + *nbytes += 
xlog_calc_iovec_len(nv->value.i_len); } /* @@ -97,6 +168,7 @@ xfs_attri_item_format( { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_log_iovec *vecp = NULL; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; attrip->attri_format.alfi_type = XFS_LI_ATTRI; attrip->attri_format.alfi_size = 1; @@ -108,22 +180,18 @@ xfs_attri_item_format( * the log recovery. */ - ASSERT(attrip->attri_name_len > 0); + ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; - if (attrip->attri_value_len > 0) + if (nv->value.i_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, - attrip->attri_name, - attrip->attri_name_len); - if (attrip->attri_value_len > 0) - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, - attrip->attri_value, - attrip->attri_value_len); + xlog_copy_from_iovec(lv, &vecp, &nv->name); + if (nv->value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->value); } /* @@ -158,41 +226,18 @@ xfs_attri_item_release( STATIC struct xfs_attri_log_item * xfs_attri_init( struct xfs_mount *mp, - uint32_t name_len, - uint32_t value_len) - + struct xfs_attri_log_nameval *nv) { struct xfs_attri_log_item *attrip; - uint32_t buffer_size = name_len + value_len; - if (buffer_size) { - /* - * This could be over 64kB in length, so we have to use - * kvmalloc() for this. But kvmalloc() utterly sucks, so we - * use own version. 
- */ - attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + - buffer_size); - } else { - attrip = kmem_cache_alloc(xfs_attri_cache, - GFP_NOFS | __GFP_NOFAIL); - } - memset(attrip, 0, sizeof(struct xfs_attri_log_item)); + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); - attrip->attri_name_len = name_len; - if (name_len) - attrip->attri_name = ((char *)attrip) + - sizeof(struct xfs_attri_log_item); - else - attrip->attri_name = NULL; - - attrip->attri_value_len = value_len; - if (value_len) - attrip->attri_value = ((char *)attrip) + - sizeof(struct xfs_attri_log_item) + - name_len; - else - attrip->attri_value = NULL; + /* + * Grab an extra reference to the name/value buffer for this log item. + * The caller retains its own reference! + */ + attrip->attri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attrip->attri_nameval); xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, &xfs_attri_item_ops); @@ -233,7 +278,7 @@ STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { kmem_free(attrdp->attrd_item.li_lv_shadow); - kmem_free(attrdp); + kmem_cache_free(xfs_attrd_cache, attrdp); } STATIC void @@ -297,7 +342,7 @@ xfs_attrd_item_intent( */ STATIC int xfs_xattri_finish_update( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, struct xfs_attrd_log_item *attrdp) { struct xfs_da_args *args = attr->xattri_da_args; @@ -335,7 +380,7 @@ STATIC void xfs_attr_log_item( struct xfs_trans *tp, struct xfs_attri_log_item *attrip, - struct xfs_attr_item *attr) + const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; @@ -343,23 +388,18 @@ xfs_attr_log_item( set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); /* - * At this point the xfs_attr_item has been constructed, and we've + * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. 
Fill in the attri log item and log format - * structure with fields from this xfs_attr_item + * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_da_args->valuelen; - attrp->alfi_name_len = attr->xattri_da_args->namelen; - attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter; - - memcpy(attrip->attri_name, attr->xattri_da_args->name, - attr->xattri_da_args->namelen); - memcpy(attrip->attri_value, attr->xattri_da_args->value, - attr->xattri_da_args->valuelen); - attrip->attri_name_len = attr->xattri_da_args->namelen; - attrip->attri_value_len = attr->xattri_da_args->valuelen; + attrp->alfi_value_len = attr->xattri_nameval->value.i_len; + attrp->alfi_name_len = attr->xattri_nameval->name.i_len; + ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; } /* Get an ATTRI. */ @@ -372,7 +412,7 @@ xfs_attr_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; ASSERT(count == 1); @@ -383,19 +423,47 @@ xfs_attr_create_intent( * Each attr item only performs one attribute operation at a time, so * this is a list of one */ - list_for_each_entry(attr, items, xattri_list) { - attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen, - attr->xattri_da_args->valuelen); - if (attrip == NULL) - return NULL; - - xfs_trans_add_item(tp, &attrip->attri_item); - xfs_attr_log_item(tp, attrip, attr); + attr = list_first_entry_or_null(items, struct xfs_attr_intent, + xattri_list); + + /* + * Create a buffer to store the attribute name and value. This buffer + * will be shared between the higher level deferred xattr work state + * and the lower level xattr log items. 
+ */ + if (!attr->xattri_nameval) { + struct xfs_da_args *args = attr->xattri_da_args; + + /* + * Transfer our reference to the name/value buffer to the + * deferred work state structure. + */ + attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, + args->namelen, args->value, args->valuelen); } + if (!attr->xattri_nameval) + return ERR_PTR(-ENOMEM); + + attrip = xfs_attri_init(mp, attr->xattri_nameval); + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; } +static inline void +xfs_attr_free_item( + struct xfs_attr_intent *attr) +{ + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + xfs_attri_log_nameval_put(attr->xattri_nameval); + if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) + kmem_free(attr); + else + kmem_cache_free(xfs_attr_intent_cache, attr); +} + /* Process an attr. */ STATIC int xfs_attr_finish_item( @@ -404,11 +472,11 @@ xfs_attr_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_attrd_log_item *done_item = NULL; int error; - attr = container_of(item, struct xfs_attr_item, xattri_list); + attr = container_of(item, struct xfs_attr_intent, xattri_list); if (done) done_item = ATTRD_ITEM(done); @@ -420,7 +488,7 @@ xfs_attr_finish_item( error = xfs_xattri_finish_update(attr, done_item); if (error != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } @@ -438,33 +506,10 @@ STATIC void xfs_attr_cancel_item( struct list_head *item) { - struct xfs_attr_item *attr; - - attr = container_of(item, struct xfs_attr_item, xattri_list); - kmem_free(attr); -} - -STATIC xfs_lsn_t -xfs_attri_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - - /* - * The attrip refers to xfs_attr_item memory to log the name and value - * with the intent item. 
This already occurred when the intent was - * committed so these fields are no longer accessed. Clear them out of - * caution since we're about to free the xfs_attr_item. - */ - attrip->attri_name = NULL; - attrip->attri_value = NULL; + struct xfs_attr_intent *attr; - /* - * The ATTRI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. - */ - return lsn; + attr = container_of(item, struct xfs_attr_intent, xattri_list); + xfs_attr_free_item(attr); } STATIC bool @@ -482,16 +527,22 @@ xfs_attri_validate( struct xfs_attri_log_format *attrp) { unsigned int op = attrp->alfi_op_flags & - XFS_ATTR_OP_FLAGS_TYPE_MASK; + XFS_ATTRI_OP_FLAGS_TYPE_MASK; if (attrp->__pad != 0) return false; + if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) + return false; + + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + /* alfi_op_flags should be either a set or remove */ switch (op) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_REMOVE: break; default: return false; @@ -517,13 +568,14 @@ xfs_attri_item_recover( struct list_head *capture_list) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error, ret = 0; int total; int local; @@ -535,41 +587,50 @@ xfs_attri_item_recover( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); if (error) return 
error; - attr = kmem_zalloc(sizeof(struct xfs_attr_item) + + attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; - attr->xattri_op_flags = attrp->alfi_op_flags; + attr->xattri_op_flags = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + /* + * We're reconstructing the deferred work state structure from the + * recovered log item. Grab a reference to the name/value buffer and + * attach it to the new work state. + */ + attr->xattri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attr->xattri_nameval); args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->name = attrip->attri_name; - args->namelen = attrp->alfi_name_len; + args->name = nv->name.i_addr; + args->namelen = nv->name.i_len; args->hashval = xfs_da_hashname(args->name, args->namelen); - args->attr_filter = attrp->alfi_attr_flags; + args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; - switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - args->value = attrip->attri_value; - args->valuelen = attrp->alfi_value_len; + switch (attr->xattri_op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: if (!xfs_inode_hasattr(args->dp)) goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); @@ -613,7 +674,7 @@ out_unlock: xfs_irele(ip); out: if (ret != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } @@ -636,22 +697,18 @@ 
xfs_attri_item_relog( attrdp = xfs_trans_get_attrd(tp, old_attrip); set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len, - old_attrp->alfi_value_len); + /* + * Create a new log item that shares the same name/value buffer as the + * old log item. + */ + new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; new_attrp->alfi_name_len = old_attrp->alfi_name_len; - new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags; - - memcpy(new_attrip->attri_name, old_attrip->attri_name, - new_attrip->attri_name_len); - - if (new_attrip->attri_value_len > 0) - memcpy(new_attrip->attri_value, old_attrip->attri_value, - new_attrip->attri_value_len); + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; xfs_trans_add_item(tp, &new_attrip->attri_item); set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); @@ -666,46 +723,46 @@ xlog_recover_attri_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; - int region = 0; + struct xfs_attri_log_nameval *nv; + const void *attr_value = NULL; + const void *attr_name; + int error; - attri_formatp = item->ri_buf[region].i_addr; + attri_formatp = item->ri_buf[0].i_addr; + attr_name = item->ri_buf[1].i_addr; - /* Validate xfs_attri_log_format */ + /* Validate xfs_attri_log_format before the large memory allocation */ if (!xfs_attri_validate(mp, attri_formatp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } - /* memory alloc failure will cause replay to abort */ - attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, - attri_formatp->alfi_value_len); - if (attrip == NULL) - return -ENOMEM; + if 
(!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } - error = xfs_attri_copy_format(&item->ri_buf[region], - &attrip->attri_format); - if (error) - goto out; + if (attri_formatp->alfi_value_len) + attr_value = item->ri_buf[2].i_addr; - region++; - memcpy(attrip->attri_name, item->ri_buf[region].i_addr, - attrip->attri_name_len); + /* + * Memory alloc failure will cause replay to abort. We attach the + * name/value buffer to the recovered incore log item and drop our + * reference. + */ + nv = xfs_attri_log_nameval_alloc(attr_name, + attri_formatp->alfi_name_len, attr_value, + attri_formatp->alfi_value_len); + if (!nv) + return -ENOMEM; - if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; + attrip = xfs_attri_init(mp, nv); + error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); + if (error) goto out; - } - - if (attrip->attri_value_len > 0) { - region++; - memcpy(attrip->attri_value, item->ri_buf[region].i_addr, - attrip->attri_value_len); - } /* * The ATTRI has two references. 
One for the ATTRD and one for ATTRI to @@ -715,9 +772,11 @@ xlog_recover_attri_commit_pass2( */ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); xfs_attri_release(attrip); + xfs_attri_log_nameval_put(nv); return 0; out: xfs_attri_item_free(attrip); + xfs_attri_log_nameval_put(nv); return error; } @@ -797,7 +856,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_size = xfs_attri_item_size, .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, - .iop_committed = xfs_attri_item_committed, .iop_release = xfs_attri_item_release, .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index c3b779f82adb..3280a7930287 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -11,25 +11,30 @@ struct xfs_mount; struct kmem_zone; +struct xfs_attri_log_nameval { + struct xfs_log_iovec name; + struct xfs_log_iovec value; + refcount_t refcount; + + /* name and value follow the end of this struct */ +}; + /* * This is the "attr intention" log item. It is used to log the fact that some * extended attribute operations need to be processed. An operation is * currently either a set or remove. Set or remove operations are described by - * the xfs_attr_item which may be logged to this intent. + * the xfs_attr_intent which may be logged to this intent. * * During a normal attr operation, name and value point to the name and value * fields of the caller's xfs_da_args structure. During a recovery, the name * and value buffers are copied from the log, and stored in a trailing buffer - * attached to the xfs_attr_item until they are committed. They are freed when - * the xfs_attr_item itself is freed when the work is done. + * attached to the xfs_attr_intent until they are committed. They are freed + * when the xfs_attr_intent itself is freed when the work is done. 
*/ struct xfs_attri_log_item { struct xfs_log_item attri_item; atomic_t attri_refcount; - int attri_name_len; - int attri_value_len; - void *attri_name; - void *attri_value; + struct xfs_attri_log_nameval *attri_nameval; struct xfs_attri_log_format attri_format; }; @@ -43,4 +48,7 @@ struct xfs_attrd_log_item { struct xfs_attrd_log_format attrd_format; }; +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + #endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index e484251dc9c8..ffa94102094d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -24,6 +24,15 @@ #include "xfs_quota.h" /* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +/* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. 
*/ @@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +int +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + void *p; + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + int i; + + if (!log->l_buf_cancel_table) + return; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a60632ecc3f0..5a171c0b244b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -576,9 +576,9 @@ xfs_file_dio_write_unaligned( * don't even bother trying the fast path in this case. 
*/ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { -retry_exclusive: if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; flags = IOMAP_DIO_FORCE_WAIT; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 888839e75d11..d4a77c53f94b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -149,12 +149,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, &lastag_extended); } else { - static struct ratelimit_state shrink_warning = \ - RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); - ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&shrink_warning)) - xfs_alert(mp, + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b2879870a17e..52d6f2c7d58b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2622,7 +2622,7 @@ xfs_ifree( */ error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; error = xfs_iunlink_remove(tp, pag, ip); if (error) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e5cb7936206..5a364a7d58fd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -37,6 +37,7 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include <linux/mount.h> #include <linux/namei.h> @@ -524,7 +525,7 @@ xfs_attrmulti_attr_set( args.valuelen = len; } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (flags & XFS_IOC_ATTR_ROOT)) xfs_forget_acl(inode, name); kfree(args.value); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e912b7fee714..29f5b8b8aca6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -24,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_error.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" 
#include <linux/posix_acl.h> #include <linux/security.h> @@ -61,7 +62,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (error < 0) break; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9dc748abdf33..1e972f884a81 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3877,44 +3877,3 @@ xlog_drop_incompat_feat( { up_read(&log->l_incompat_users); } - -/* - * Get permission to use log-assisted atomic exchange of file extents. - * - * Callers must not be running any transactions or hold any inode locks, and - * they must release the permission by calling xlog_drop_incompat_feat - * when they're done. - */ -int -xfs_attr_use_log_assist( - struct xfs_mount *mp) -{ - int error = 0; - - /* - * Protect ourselves from an idle log clearing the logged xattrs log - * incompat feature bit. - */ - xlog_use_incompat_feat(mp->m_log); - - /* - * If log-assisted xattrs are already enabled, the caller can use the - * log assisted swap functions with the log-incompat reference we got. - */ - if (xfs_sb_version_haslogxattrs(&mp->m_sb)) - return 0; - - /* Enable log-assisted xattrs. */ - error = xfs_add_incompat_log_feature(mp, - XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); - if (error) - goto drop_incompat; - - xfs_warn_once(mp, -"EXPERIMENTAL logged extended attributes feature added. 
Use at your own risk!"); - - return 0; -drop_incompat: - xlog_drop_incompat_feat(mp->m_log); - return error; -} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 252b098cde1f..f3ce046a7d45 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -86,6 +86,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } +static inline void * +xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + const struct xfs_log_iovec *src) +{ + return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); +} + /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 67fd9789e69a..686c01eb3661 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -428,9 +428,6 @@ struct xlog { struct rw_semaphore l_incompat_users; }; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 97b941c07957..5f7e4e6e33ce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -39,13 +39,6 @@ STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); -#if defined(DEBUG) -STATIC void -xlog_recover_check_summary( - struct xlog *); -#else -#define xlog_recover_check_summary(log) -#endif STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); @@ -3230,7 +3223,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3238,37 +3231,25 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. 
*/ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; } @@ -3339,8 +3320,6 @@ xlog_do_recover( } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - xlog_recover_check_summary(log); - /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; @@ -3483,7 +3462,6 @@ xlog_recover_finish( } xlog_recover_process_iunlinks(log); - xlog_recover_check_summary(log); /* * Recover any CoW staging blocks that are still referenced by the @@ -3517,52 +3495,3 @@ xlog_recover_cancel( xlog_recover_cancel_intents(log); } -#if defined(DEBUG) -/* - * Read all of the agf and agi counters and check that they - * are consistent with the superblock counters. 
- */ -STATIC void -xlog_recover_check_summary( - struct xlog *log) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; - - freeblks = 0LL; - itotal = 0LL; - ifree = 0LL; - for_each_perag(mp, agno, pag) { - error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); - if (error) { - xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agf *agfp = agfbp->b_addr; - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - } - - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); - if (error) { - xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agi *agi = agibp->b_addr; - - itotal += be32_to_cpu(agi->agi_count); - ifree += be32_to_cpu(agi->agi_freecount); - xfs_buf_relse(agibp); - } - } -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 55ee464ab59f..cc323775a12c 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -75,6 +75,12 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_mount(mp, warntag, fmt, ...) \ +do { \ + if (xfs_should_warn((mp), (warntag))) \ + xfs_warn((mp), (fmt), ##__VA_ARGS__); \ +} while (0) + #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) 
\ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0c0bcbd4949d..daa8d29c46b4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1356,7 +1356,6 @@ xfs_clear_incompat_log_features( if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { - xfs_info(mp, "Clearing log incompat feature flags."); xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c42786e4942..ba5d42abf66e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -391,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about online fsck being used on this fs. */ +#define XFS_OPSTATE_WARNED_SCRUB 7 +/* Kernel has logged a warning about shrink being used on this fs. */ +#define XFS_OPSTATE_WARNED_SHRINK 8 +/* Kernel has logged a warning about logged xattr updates being used. */ +#define XFS_OPSTATE_WARNED_LARP 9 + #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ @@ -413,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +static inline bool +xfs_should_warn(struct xfs_mount *mp, long nr) +{ + return !test_and_set_bit(nr, &mp->m_opstate); +} + #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ @@ -420,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ - { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } /* * Max and min values for 
mount-option defined I/O diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8fc813cb6011..abf08bbf34a9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1308,8 +1308,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. + */ + xfs_qm_dqpurge_all(mp); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8495ef076ffc..ed18160e6181 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,8 @@ #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -2079,8 +2081,24 @@ xfs_init_caches(void) if (!xfs_bui_cache) goto out_destroy_bud_cache; + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + return 0; + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); out_destroy_cui_cache: @@ -2127,6 +2145,8 @@ xfs_destroy_caches(void) * destroy caches. 
*/ rcu_barrier(); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 167d23f92ffe..3cd5a51bace1 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -91,7 +91,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; -extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7a044afd4c46..35e13e125ec6 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -15,9 +15,86 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_log.h" +#include "xfs_xattr.h" #include <linux/posix_acl_xattr.h> +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +static inline int +xfs_attr_grab_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. 
Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} + +static inline void +xfs_attr_rele_log_assist( + struct xfs_mount *mp) +{ + xlog_drop_incompat_feat(mp->m_log); +} + +/* + * Set or remove an xattr, having grabbed the appropriate logging resources + * prior to calling libxfs. + */ +int +xfs_attr_change( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + bool use_logging = false; + int error; + + if (xfs_has_larp(mp)) { + error = xfs_attr_grab_log_assist(mp); + if (error) + return error; + + use_logging = true; + } + + error = xfs_attr_set(args); + + if (use_logging) + xfs_attr_rele_log_assist(mp); + return error; +} + static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, @@ -56,7 +133,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h new file mode 100644 index 000000000000..2b09133b1b9b --- /dev/null +++ b/fs/xfs/xfs_xattr.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_XATTR_H__ +#define __XFS_XATTR_H__ + +int xfs_attr_change(struct xfs_da_args *args); + +extern const struct xattr_handler *xfs_xattr_handlers[]; + +#endif /* __XFS_XATTR_H__ */ |
