From f791357330b0043ec953ce122ab7519af4b9d24a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 5 Aug 2022 12:33:03 +0800 Subject: ceph: wake up the waiters if any new caps comes When new caps comes we need to wake up the waiters and also when revoking the caps, there also could be new caps comes. Link: https://tracker.ceph.com/issues/54044 Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 53cfe026b3ea..0ddd91eadbce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -754,6 +754,7 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; + wake_up_all(&ci->i_cap_wq); } /* @@ -3550,6 +3551,9 @@ static void handle_cap_grant(struct inode *inode, check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ + /* If there is new caps, try to wake up the waiters */ + if (~cap->issued & newcaps) + wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { -- cgit v1.2.3 From 6eb06c46214d33c71ae86d60b3fc9cb17c20beca Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 27 Jul 2022 12:29:10 +0800 Subject: ceph: fail the request if the peer MDS doesn't support getvxattr op Just fail the request instead sending the request out, or the peer MDS will crash. Link: https://tracker.ceph.com/issues/56529 Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 11 +++++++++++ fs/ceph/mds_client.h | 6 +++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 42351d7a0dd6..b4a3cb07d5b0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2356,6 +2356,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, goto out; } + req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR; req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { err = -ENOMEM; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 80f8b9ec1a31..26a0a8b9975e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2318,6 +2318,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; + req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); @@ -2916,6 +2917,16 @@ static void __do_request(struct ceph_mds_client *mdsc, dout("do_request mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); + + /* + * The old ceph will crash the MDSs when see unknown OPs + */ + if (req->r_feature_needed > 0 && + !test_bit(req->r_feature_needed, &session->s_features)) { + err = -EOPNOTSUPP; + goto out_session; + } + if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 256e3eada6c1..0598faa50e2e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -31,8 +31,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_OP_GETVXATTR, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,6 +45,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ + CEPHFS_FEATURE_OP_GETVXATTR, \ } /* @@ -336,6 +338,8 @@ struct ceph_mds_request { long long r_dir_ordered_cnt; int r_readdir_cache_idx; + int r_feature_needed; + struct ceph_cap_reservation r_caps_reservation; }; -- cgit v1.2.3 From 7c3ea9870e09e193981695dd67c37a1a2b6d600b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 11 Aug 2022 13:00:53 +0800 Subject: ceph: no need to wait for transition RDCACHE|RD -> RD For write when trying to get the Fwb caps we need to keep waiting on transition from WRBUFFER|WR -> WR to avoid a new WR sync write from going before a prior buffered writeback happens. While for read there is no need to wait on transition from RDCACHE|RD -> RD, and we can just exclude the revoking caps and force to sync read. Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0ddd91eadbce..0dc1251c3c6d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2760,13 +2760,17 @@ again: * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. + * + * For RDCACHE|RD -> RD, there is not need to wait and we can + * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; + int exclude = revoking & not; dout("get_cap_refs %p have %s but not %s (revoking %s)\n", inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); - if ((revoking & not) == 0) { + if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { @@ -2788,7 +2792,7 @@ again: snap_rwsem_locked = true; } if ((have & want) == want) - *got = need | want; + *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); -- cgit v1.2.3 From aa1d627207cace003163dee24d1c06fa4e910c6b Mon Sep 17 00:00:00 2001 From: Kenneth Lee Date: Thu, 18 Aug 2022 22:42:55 -0700 Subject: ceph: Use kcalloc for allocating multiple elements Prefer using kcalloc(a, b) over kzalloc(a * b) as this improves semantics since kcalloc is intended for allocating an array of memory. Signed-off-by: Kenneth Lee Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0dc1251c3c6d..fb023f9fafcb 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2286,7 +2286,7 @@ retry: struct ceph_mds_request *req; int i; - sessions = kzalloc(max_sessions * sizeof(s), GFP_KERNEL); + sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { err = -ENOMEM; goto out; -- cgit v1.2.3 From b4b924c7a16e857b0715603456045251a49f2ea6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 24 Aug 2022 09:24:42 -0400 Subject: ceph: increment i_version when doing a setattr with caps When the client has enough caps to satisfy a setattr locally without having to talk to the server, we currently do the setattr without incrementing the change attribute. Ensure that if the ctime changes locally, then the change attribute does too. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b4a3cb07d5b0..a5e2eb5704c9 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2192,6 +2192,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); inode->i_ctime = attr->ia_ctime; + inode_inc_iversion_raw(inode); } release &= issued; -- cgit v1.2.3 From bd04b9192e1ff6859d6b3906e91cfd5c9b0ad55b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 30 Aug 2022 22:49:36 +0800 Subject: ceph: fail the open_by_handle_at() if the dentry is being unlinked MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unlinking a file the kclient will send a unlink request to MDS by holding the dentry reference, and then the MDS will return 2 replies, which are unsafe reply and a deferred safe reply. After the unsafe reply received the kernel will return and succeed the unlink request to user space apps. Only when the safe reply received the dentry's reference will be released. Or the dentry will only be unhashed from dcache. But when the open_by_handle_at() begins to open the unlinked files it will succeed. The inode->i_count couldn't be used to check whether the inode is opened or not. Link: https://tracker.ceph.com/issues/56524 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Luís Henriques Tested-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e0fa66ac8b9f..f780e4e0d062 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -181,6 +181,7 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino) static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct inode *inode = __lookup_inode(sb, ino); + struct ceph_inode_info *ci = ceph_inode(inode); int err; if (IS_ERR(inode)) @@ -192,7 +193,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(err); } /* -ESTALE if inode as been unlinked and no file is open */ - if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { + if ((inode->i_nlink == 0) && !__ceph_is_file_opened(ci)) { iput(inode); return ERR_PTR(-ESTALE); } -- cgit v1.2.3 From aa87052dd965a6094355fcc13d5abc3f5bebfbe4 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 31 Aug 2022 12:13:28 +0800 Subject: ceph: fix incorrectly showing the .snap size for stat We should set the 'stat->size' to the real number of snapshots for snapdirs. Link: https://tracker.ceph.com/issues/57342 Signed-off-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a5e2eb5704c9..9ebb7cee7978 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2449,6 +2449,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); + struct super_block *sb = inode->i_sb; struct ceph_inode_info *ci = ceph_inode(inode); u32 valid_mask = STATX_BASIC_STATS; int err = 0; @@ -2478,16 +2479,34 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, } if (ceph_snap(inode) == CEPH_NOSNAP) - stat->dev = inode->i_sb->s_dev; + stat->dev = sb->s_dev; else stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), - RBYTES)) + if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { stat->size = ci->i_rbytes; - else + } else if (ceph_snap(inode) == CEPH_SNAPDIR) { + struct ceph_inode_info *pci; + struct ceph_snap_realm *realm; + struct inode *parent; + + parent = ceph_lookup_inode(sb, ceph_ino(inode)); + if (!parent) + return PTR_ERR(parent); + + pci = ceph_inode(parent); + spin_lock(&pci->i_ceph_lock); + realm = pci->i_snap_realm; + if (realm) + stat->size = realm->num_snaps; + else + stat->size = 0; + spin_unlock(&pci->i_ceph_lock); + iput(parent); + } else { stat->size = ci->i_files + ci->i_subdirs; + } stat->blocks = 0; stat->blksize = 65536; /* -- cgit v1.2.3