Diffstat (limited to 'fs')
61 files changed, 1525 insertions, 1225 deletions
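The first diff below (fs/attr.c) converts the setattr permission checks from raw kuid_t/kgid_t values to the dedicated vfsuid_t/vfsgid_t types, so ids translated through an idmapped mount can no longer be compared against filesystem-wide ids by accident. As a rough illustration of that type-split idea, here is a minimal userspace sketch, not the kernel implementation: only the helper names vfsuid_eq, vfsuid_eq_kuid and vfsuid_valid are taken from the diff, and the stub type and helper definitions are assumptions made for illustration.

/*
 * Illustrative stand-ins only: wrapping the raw id in a distinct struct
 * makes it a compile error to compare a mount-idmapped id against a
 * filesystem-wide one without going through an explicit helper.
 */
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

typedef struct { uid_t val; } kuid_t;   /* filesystem-wide id */
typedef struct { uid_t val; } vfsuid_t; /* id as seen through an idmapped mount */

#define INVALID_UID ((uid_t)-1)

static bool vfsuid_valid(vfsuid_t vfsuid)
{
	return vfsuid.val != INVALID_UID;
}

static bool vfsuid_eq(vfsuid_t a, vfsuid_t b)
{
	return a.val == b.val;
}

/* the only sanctioned cross-type comparison goes through a named helper */
static bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
	return vfsuid.val == kuid.val;
}

int main(void)
{
	kuid_t fsuid = { 1000 };
	vfsuid_t mapped = { 1000 };

	/* vfsuid_eq(mapped, fsuid) would not compile: the types differ */
	printf("caller owns inode: %d\n", vfsuid_eq_kuid(mapped, fsuid));
	printf("mapping valid: %d\n", vfsuid_valid(mapped));
	return 0;
}

The in-kernel helpers additionally perform the actual mount idmapping translation (see i_uid_into_vfsuid()/i_gid_into_vfsgid() in the hunks below); the sketch only shows why the distinct wrapper types catch the kuid/vfsuid mix-ups that chown_ok() and chgrp_ok() are being hardened against.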
diff --git a/fs/attr.c b/fs/attr.c index dbe996b0dedf..b5b8835ddf15 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -22,7 +22,7 @@ * chown_ok - verify permissions to chown inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @uid: uid to chown @inode to + * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -31,15 +31,15 @@ * performed on the raw inode simply passs init_user_ns. */ static bool chown_ok(struct user_namespace *mnt_userns, - const struct inode *inode, - kuid_t uid) + const struct inode *inode, vfsuid_t ia_vfsuid) { - kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && + vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(kuid, INVALID_UID) && + if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, * chgrp_ok - verify permissions to chgrp inode * @mnt_userns: user namespace of the mount @inode was found from * @inode: inode to check permissions on - * @gid: gid to chown @inode to + * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the user namespace of * the vfsmount must be passed through @mnt_userns. This function will then @@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns, * performed on the raw inode simply passs init_user_ns. */ static bool chgrp_ok(struct user_namespace *mnt_userns, - const struct inode *inode, kgid_t gid) + const struct inode *inode, vfsgid_t ia_vfsgid) { - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { - kgid_t mapped_gid; - - if (gid_eq(gid, inode->i_gid)) + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { + if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; - mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); - if (in_group_p(mapped_gid)) + if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(kgid, INVALID_GID) && + if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && + !chown_ok(mnt_userns, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && + !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. 
*/ if (ia_valid & ATTR_MODE) { - kgid_t mapped_gid; + vfsgid_t vfsgid; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ia_valid & ATTR_GID) - mapped_gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), attr->ia_gid); + vfsgid = attr->ia_vfsgid; else - mapped_gid = i_gid_into_mnt(mnt_userns, inode); + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); /* Also check the setgid bit! */ - if (!in_group_p(mapped_gid) && + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok); * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr on idmapped mounts. If file ownership is changed setattr_copy - * doesn't map ia_uid and ia_gid. It will asssume the caller has already - * provided the intended values. Necessary permission checks to determine + * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex @@ -242,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -254,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(kgid) && + vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (!vfsgid_in_group_p(vfsgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; @@ -306,9 +301,6 @@ EXPORT_SYMBOL(may_setattr); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * - * If file ownership is changed notify_change() doesn't map ia_uid and - * ia_gid. It will asssume the caller has already provided the intended values. - * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding @@ -397,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, * namespace of the superblock. */ if (ia_valid & ATTR_UID && - !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && - !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns, + attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. 
*/ if (!(ia_valid & ATTR_UID) && - !uid_valid(i_uid_into_mnt(mnt_userns, inode))) + !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && - !gid_valid(i_gid_into_mnt(mnt_userns, inode))) + !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode))) return -EOVERFLOW; - error = security_inode_setattr(dentry, attr); + error = security_inode_setattr(mnt_userns, dentry, attr); if (error) return error; error = try_break_deleg(inode, delegated_inode); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8..1105ce3c80cb 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fb..71dab733cf9a 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index bfac462dd3e8..19ef136f9e4f 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work) if (callbacks[i].flags & DLM_CB_SKIP) { continue; } else if (callbacks[i].flags & DLM_CB_BAST) { - bastfn(lkb->lkb_astparam, callbacks[i].mode); trace_dlm_bast(ls, lkb, callbacks[i].mode); + bastfn(lkb->lkb_astparam, callbacks[i].mode); } else if (callbacks[i].flags & DLM_CB_CAST) { lkb->lkb_lksb->sb_status = callbacks[i].sb_status; lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags; + trace_dlm_ast(ls, lkb); castfn(lkb->lkb_astparam); - trace_dlm_ast(ls, lkb, lkb->lkb_lksb); } } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 42eee2783756..ac8b62106ce0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,8 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; - unsigned int cl_waitwarn_us; +#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -102,8 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, - CLUSTER_ATTR_WAITWARN_US, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -CLUSTER_ATTR(waitwarn_us, NULL); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, - [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = 
&cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; - cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ -#define DEFAULT_WAITWARN_US 0 +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, - .ci_waitwarn_us = DEFAULT_WAITWARN_US, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index df92b0a07fc6..55c5f2c13ebd 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,8 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; - int ci_waitwarn_us; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..8aca8085d24e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,10 +205,20 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ +/* least significant 2 bytes are message changed, they are full transmitted + * but at receive side only the 2 bytes LSB will be set. + * + * Even wireshark dlm dissector does only evaluate the lower bytes and note + * that they may not be used on transceiver side, we assume the higher bytes + * are for internal use or reserved so long they are not parsed on receiver + * side. 
+ */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -249,10 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; - ktime_t lkb_wait_time; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -568,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -606,8 +622,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; @@ -688,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 226822f49d30..dac7eb75dba9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. 
@@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, @@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype) return -1; } -static int nodeid_warned(int nodeid, int num_nodes, int *warned) -{ - int i; - - for (i = 0; i < num_nodes; i++) { - if (!warned[i]) { - warned[i] = nodeid; - return 0; - } - if (warned[i] == nodeid) - return 1; - } - return 0; -} - -void dlm_scan_waiters(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - s64 us; - s64 debug_maxus = 0; - u32 debug_scanned = 0; - u32 debug_expired = 0; - int num_nodes = 0; - int *warned = NULL; - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (!lkb->lkb_wait_time) - continue; - - debug_scanned++; - - us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); - - if (us < dlm_config.ci_waitwarn_us) - continue; - - lkb->lkb_wait_time = 0; - - debug_expired++; - if (us > debug_maxus) - debug_maxus = us; - - if (!num_nodes) { - num_nodes = ls->ls_num_nodes; - warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); - } - if (!warned) - continue; - if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) - continue; - - log_error(ls, "waitwarn %x %lld %d us check connection to " - "node %d", lkb->lkb_id, (long long)us, - dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); - } - mutex_unlock(&ls->ls_waiters_mutex); - kfree(warned); - - if (debug_expired) - log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", - debug_scanned, debug_expired, - dlm_config.ci_waitwarn_us, (long long)debug_maxus); -} - /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ @@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; - lkb->lkb_wait_time = ktime_get(); lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); @@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); - - if (!dlm_config.ci_waitwarn_us) - return; - - mutex_lock(&ls->ls_waiters_mutex); - list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_to_us(lkb->lkb_wait_time)) - lkb->lkb_wait_time = ktime_get(); - } - mutex_unlock(&ls->ls_waiters_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void 
(*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error) goto out; - trace_dlm_lock_start(ls, lkb, mode, flags); + trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace, if (error == -EINPROGRESS) error = 0; out_put: - trace_dlm_lock_end(ls, lkb, mode, flags, error); + trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error); if (convert || error) __put_lkb(ls, lkb); @@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; @@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid) + uint32_t *lkid) { struct dlm_lkb *lkb = NULL, *iter; struct 
dlm_user_args *ua; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 252a5898f908..a7b6474f009d 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -void dlm_scan_waiters(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs, uint32_t *lkid); + uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..3972f4d86c75 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -275,7 +275,6 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); - dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) +#ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v6.2!\n" + " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" + "===============================================================\n"); + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); 
INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 19e82f08c0e0..a4e84e8d94c8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk) return; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) { - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); queue_work(send_workqueue, &con->swork); return; } @@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock, return ret; if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); + log_print("connected to node %d", con->nodeid); return 0; } diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 98084e0cfccf..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) int i, error, neg = 0, low = -1; /* previously removed members that we've not finished removing need to - count as a negative change so the "neg" recovery steps will happen */ + * count as a negative change so the "neg" recovery steps will happen + * + * This functionality must report all member changes to lsops or + * midcomms layer and must never return before. + */ list_for_each_entry(memb, &ls->ls_nodes_gone, list) { log_rinfo(ls, "prev removed member %d", memb->nodeid); @@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. - * We ignore this recovery action and let run the new one which might - * have new member configuration. - */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config - * is good or bad - */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } @@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls) if (!ls->ls_recover_begin) ls->ls_recover_begin = jiffies; - dlm_lsop_recover_prep(ls); + /* call recover_prep ops only once and not multiple times + * for each possible dlm_ls_stop() when recovery is already + * stopped. + * + * If we successful was able to clear LSFL_RUNNING bit and + * it was set we know it is the first dlm_ls_stop() call. 
+ */ + if (new) + dlm_lsop_recover_prep(ls); + return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 0993eebf2060..737f185aad8d 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -29,6 +29,8 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; + /* if lock op got interrupted while waiting dlm_controld reply */ + bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -79,8 +81,7 @@ static void send_op(struct plock_op *op) abandoned waiter. So, we have to insert the unlock-close when the lock call is interrupted. */ -static void do_unlock_close(struct dlm_ls *ls, u64 number, - struct file *file, struct file_lock *fl) +static void do_unlock_close(const struct dlm_plock_info *info) { struct plock_op *op; @@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number, return; op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = fl->fl_pid; - op->info.fsid = ls->ls_global_id; - op->info.number = number; + op->info.pid = info->pid; + op->info.fsid = info->fsid; + op->info.number = info->number; op->info.start = 0; op->info.end = OFFSET_MAX; - if (fl->fl_lmops && fl->fl_lmops->lm_grant) - op->info.owner = (__u64) fl->fl_pid; - else - op->info.owner = (__u64)(long) fl->fl_owner; + op->info.owner = info->owner; op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); @@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); - list_del(&op->list); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + + op->sigint = true; spin_unlock(&ops_lock); - log_print("%s: wait interrupted %x %llx, op removed", + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, - (unsigned long long)number); - dlm_release_plock_op(op); - do_unlock_close(ls, number, file, fl); + (unsigned long long)number, op->info.pid); goto out; } +do_lock_wait: + WARN_ON(!list_empty(&op->list)); rv = op->info.rv; @@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { - op = list_entry(send_list.next, struct plock_op, list); + op = list_first_entry(&send_list, struct plock_op, list); if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else @@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (iter->info.fsid == info.fsid && iter->info.number == info.number && iter->info.owner == info.owner) { + if (iter->sigint) { + list_del(&iter->list); + spin_unlock(&ops_lock); + + pr_debug("%s: sigint cleanup %x %llx pid %d", + __func__, iter->info.fsid, + (unsigned long long)iter->info.number, + iter->info.pid); + do_unlock_close(&iter->info); + memcpy(&iter->info, &info, sizeof(info)); + dlm_release_plock_op(iter); + return count; + } list_del_init(&iter->list); memcpy(&iter->info, &info, sizeof(info)); if (iter->data) @@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx - may got interrupted?", __func__, + log_print("%s: no op %x %llx", __func__, info.fsid, (unsigned long long)info.number); return count; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index a55dfce705dd..e15eb511b04b 
100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) /* * Add or remove nodes from the lockspace's ls_nodes list. + * + * Due to the fact that we must report all membership changes to lsops + * or midcomms layer, it is not permitted to abort ls_recover() until + * this is done. */ error = dlm_recover_members(ls, rv, &neg); @@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); - dlm_lsop_recover_done(ls); return 0; fail: dlm_release_root_list(ls); - log_rinfo(ls, "dlm_recover %llu error %d", - (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + return error; } @@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; + int error; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - ls_recover(ls, rv); + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted -EINTR we wait for the next + * ls_recover() iteration until it hopefully succeeds. + */ + log_rinfo(ls, "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); + + /* let new_lockspace() get aware of critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + kfree(rv->nodes); kfree(rv); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1060b24f18d4..99e8f0744513 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v6.2!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, - (unsigned long) params->timeout, &lkid); if (!error) error = lkid; } else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 19e6c56a9f47..26fa170090b8 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -17,7 +17,7 @@ struct 
z_erofs_decompress_req { /* indicate the algorithm will be used for decompression */ unsigned int alg; - bool inplace_io, partial_decoding; + bool inplace_io, partial_decoding, fillgaps; }; struct z_erofs_decompressor { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fbb037ba326e..fe8ac0e163f7 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return iomap_bmap(mapping, block, &erofs_iomap_ops); } -static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - loff_t align = iocb->ki_pos | iov_iter_count(to) | - iov_iter_alignment(to); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int blksize_mask; - - if (bdev) - blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; - else - blksize_mask = (1 << inode->i_blkbits) - 1; - if (align & blksize_mask) - return -EINVAL; - return 0; -} - -static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ /* no need taking (shared) inode lock since it's a ro filesystem */ if (!iov_iter_count(to)) return 0; #ifdef CONFIG_FS_DAX - if (IS_DAX(iocb->ki_filp->f_mapping->host)) + if (IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if (iocb->ki_flags & IOCB_DIRECT) { - int err = erofs_prepare_dio(iocb, to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = bdev_logical_block_size(bdev) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if ((iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to)) & blksize_mask) + return -EINVAL; - if (!err) - return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0, NULL, 0); - if (err < 0) - return err; + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0, NULL, 0); } return filemap_read(iocb, to, 0); } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6dca1900c733..2d55569f96ac 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, j = 0; /* 'valid' bounced can only be tested after a complete round */ - if (test_bit(j, bounced)) { + if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; @@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, if (page) { __clear_bit(j, bounced); - if (kaddr) { - if (kaddr + PAGE_SIZE == page_address(page)) + if (!PageHighMem(page)) { + if (!i) { + kaddr = page_address(page); + continue; + } + if (kaddr && + kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; - else - kaddr = NULL; - } else if (!i) { - kaddr = page_address(page); + continue; + } } + kaddr = NULL; continue; } kaddr = NULL; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 05a3063cf2bc..5e59b3f523eb 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -143,6 +143,7 @@ again: DBG_BUGON(z_erofs_lzma_head); z_erofs_lzma_head = head; spin_unlock(&z_erofs_lzma_lock); + wake_up_all(&z_erofs_lzma_wq); z_erofs_lzma_max_dictsize = dict_size; mutex_unlock(&lzma_resize_mutex); diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 18e59821c597..ecf28f66b97d 100644 --- a/fs/erofs/dir.c +++ 
b/fs/erofs/dir.c @@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, } static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, - void *dentry_blk, unsigned int *ofs, + void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; while (de < end) { @@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, /* stopped by some reason */ return 1; ++de; - *ofs += sizeof(struct erofs_dirent); + ctx->pos += sizeof(struct erofs_dirent); } - *ofs = maxsize; return 0; } @@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) nameoff = le16_to_cpu(de->nameoff); if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE) { + nameoff >= EROFS_BLKSIZ) { erofs_err(dir->i_sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; - goto skip_this; + break; } maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, PAGE_SIZE); + dirsize - ctx->pos + ofs, EROFS_BLKSIZ); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); + ctx->pos = blknr_to_addr(i) + ofs; if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(dir, ctx, de, &ofs, + err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); -skip_this: - ctx->pos = blknr_to_addr(i) + ofs; - if (err) break; +skip_this: + ctx->pos = blknr_to_addr(i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 724bb57075f6..5792ca9e0d5e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" @@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + static void z_erofs_destroy_pcluster_pool(void) { int i; @@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void) for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { - size = struct_size(a, compressed_pages, pcs->maxpages); + size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, @@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void) return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. 
- * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| */ - COLLECT_PRIMARY_HOOKED, + Z_EROFS_PCLUSTER_HOOKED, /* - * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or - * pagevec) since it can be directly decoded without I/O submission. + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -184,39 +260,36 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. 
] */ - COLLECT_PRIMARY_FOLLOWED, + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; - struct z_erofs_pagevec_ctor vector; - + struct page *candidate_bvpage; struct z_erofs_pcluster *pcl, *tailpcl; - /* a pointer used to pick up inplace I/O pages */ - struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + enum z_erofs_pclustermode mode; bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; + + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, enum z_erofs_cache_alloctype type, @@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; - struct page **pages; - pgoff_t index; + unsigned int i; - if (fe->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - pages = pcl->compressed_pages; - index = pcl->obj.index; - for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); @@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) @@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * managed cache since it can be moved to the bypass queue instead. */ if (standalone) - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * therefore no need to worry about available decompression users. 
*/ for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } @@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret = 0; /* 0 - busy */ + int ret, i; - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } - erofs_workgroup_unfreeze(&pcl->obj, 1); - - if (ret) - detach_page_private(page); } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; - while (fe->icpage_ptr > pcl->compressed_pages) - if (!cmpxchg(--fe->icpage_ptr, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } + } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, - struct page *page, enum z_erofs_page_type type, - bool pvec_safereuse) + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - /* give priority for inplaceio */ - if (fe->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(fe, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, - pvec_safereuse); - fe->pcl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) @@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. 
*/ - f->mode = COLLECT_PRIMARY_FOLLOWED; + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } @@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) * type 2, link to the end of an existing open chain, be careful * that its submission is controlled by the original attached chain. */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; f->tailpcl = NULL; return; } /* type 3, it belongs to a chain, but it isn't the end of the chain */ - f->mode = COLLECT_PRIMARY; + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl = fe->pcl; - unsigned int length; - - /* to avoid unexpected loop formed by corrupted images */ - if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; - - z_erofs_try_to_claim_pcluster(fe); - return 0; -} - -static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; @@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, atomic_set(&pcl->obj.refcount, 1); pcl->algorithmformat = map->m_algorithmformat; - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; - fe->mode = COLLECT_PRIMARY_FOLLOWED; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others @@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; - grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj); + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; @@ -520,11 +548,10 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); @@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (map->m_flags & EROFS_MAP_META) { - if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - goto tailpacking; + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { + DBG_BUGON(1); + return -EFSCORRUPTED; } - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; } else { -tailpacking: - ret = z_erofs_register_pcluster(fe, inode, map); - if (!ret) - goto out; - if (ret != -EEXIST) - return ret; + ret = z_erofs_register_pcluster(fe); } - ret = z_erofs_lookup_pcluster(fe, inode, map); - if (ret) { - erofs_workgroup_put(&fe->pcl->obj); + if (ret == -EEXIST) { + mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; + + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { return ret; } - -out: - z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->pcl->pagevec, fe->pcl->vcnt); + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ - fe->icpage_ptr = fe->pcl->compressed_pages + - z_erofs_pclusterpages(fe->pcl); + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&fe->vector, false); + z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } + /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
*/ - if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; @@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -653,7 +681,7 @@ repeat: map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) - goto err_out; + goto out; } else { if (fe->pcl) goto hitted; @@ -663,9 +691,9 @@ repeat: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; - err = z_erofs_collector_begin(fe, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; if (z_erofs_is_inline_pcluster(fe->pcl)) { void *mp; @@ -676,11 +704,12 @@ repeat: err = PTR_ERR(mp); erofs_err(inode->i_sb, "failed to get inline page, err %d", err); - goto err_out; + goto out; } get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page); - fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, @@ -696,10 +725,10 @@ hitted: * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED && - fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -707,60 +736,59 @@ hitted: goto next_part; } - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(fe, page, page_type, - fe->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional short-lived page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - alloc_page(GFP_NOFS | __GFP_NOFAIL); - - set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); - err = z_erofs_attach_page(fe, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + + if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, @@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page) return !page->mapping && !z_erofs_is_shortlived_page(page); } -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct page **pagepool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, inputsize, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - enum z_erofs_page_type page_type; - bool overlapped, partial; - int err; + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - might_sleep(); - DBG_BUGON(!READ_ONCE(pcl->nr_pages)); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; 
+}; - mutex_lock(&pcl->lock); - nr_pages = pcl->nr_pages; +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) +{ + struct z_erofs_bvec_item *item; - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; + struct page *oldpage; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + oldpage = be->decompressed_pages[pgnr]; + be->decompressed_pages[pgnr] = bvec->page; - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } + if (!oldpage) + return; } - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - pcl->pagevec, 0); - - for (i = 0; i < pcl->vcnt; ++i) { - unsigned int pagenr; + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - page = z_erofs_pagevec_dequeue(&ctor, &page_type); +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? 
-bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; + } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); + } +} - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(z_erofs_page_is_invalidated(page)); +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - if (z_erofs_put_shortlivedpage(pagepool, page)) - continue; + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - DBG_BUGON(pagenr >= nr_pages); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); } - z_erofs_pagevec_ctor_exit(&ctor, true); - overlapped = false; - compressed_pages = pcl->compressed_pages; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; + + *overlapped = false; for (i = 0; i < pclusterpages; ++i) { - unsigned int pagenr; + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - page = compressed_pages[i]; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); + /* compressed pages ought to be present before decompressing */ + if (!page) { + DBG_BUGON(1); + continue; + } + be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) @@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { - if (erofs_page_is_managed(sbi, page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; + } + } - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = 
-EFSCORRUPTED; - } - pages[pagenr] = page; + if (err) + return err; + return 0; +} - overlapped = true; - } +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; - /* PG_error needs checking for all non-managed pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { - outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; - partial = true; - } - if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, - .outputsize = outputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { - page = compressed_pages[0]; - WRITE_ONCE(compressed_pages[0], NULL); + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = compressed_pages[i]; + page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ - (void)z_erofs_put_shortlivedpage(pagepool, page); - WRITE_ONCE(compressed_pages[i], NULL); + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; 
DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ - if (z_erofs_put_shortlivedpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - pcl->nr_pages = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ @@ -997,22 +1085,25 @@ out: static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); - erofs_workgroup_put(&pcl->obj); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); - return; } @@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) @@ -1087,7 +1177,7 @@ repeat: * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; @@ -1113,14 +1203,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. 
*/ DBG_BUGON(!justfound); @@ -1149,7 +1238,8 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; @@ -1186,6 +1276,7 @@ fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (err) - SetPageError(page); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } + if (err) + q->eio = true; z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); bio_put(bio); } -static void z_erofs_submit_queue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; @@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb, struct page *page; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi)); + mc); if (!page) continue; @@ -1369,15 +1459,14 @@ submit_bio_retry: z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); } -static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_decompress_frontend *f, +static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, 0)); if (err) @@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, + z_erofs_runqueue(&f, &pagepool, z_erofs_get_sync_decompress_policy(sbi, nr_pages)); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 58053bb5066f..e7f04c4fbb81 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -7,13 +7,10 @@ #define __EROFS_FS_ZDATA_H #include "internal.h" -#include "zpvec.h" +#include "tagptr.h" #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_NR_INLINE_PAGEVECS 3 - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 +#define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing @@ -21,6 +18,21 @@ */ typedef void 
*z_erofs_next_pcluster_t; +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + /* * Structure fields follow one of the following exclusion rules. * @@ -38,24 +50,21 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: lower limit of decompressed length and if full length or not */ + /* L: the maximum decompression size of this round */ unsigned int length; + /* L: total number of bvecs */ + unsigned int vcnt; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; - /* L: maximum relative page index in pagevec[] */ - unsigned short nr_pages; - - /* L: total number of pages in pagevec[] */ - unsigned int vcnt; - union { - /* L: inline a certain number of pagevecs for bootstrap */ - erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; @@ -72,8 +81,14 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; - /* A: compressed pages (can be cached or inplaced pages) */ - struct page *compressed_pages[]; + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; }; /* let's avoid the valid 32-bit kernel addresses */ @@ -94,6 +109,8 @@ struct z_erofs_decompressqueue { struct completion done; struct work_struct work; } u; + + bool eio; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) @@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) return pcl->pclusterpages; } -#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 -#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) -#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) - /* - * waiters (aka. 
ongoing_packs): # to unlock the page - * sub-index: 0 - for partial page, >= 1 full page sub-index + * bit 31: I/O error occurred on this page + * bit 0 - 30: remaining parts to complete this page */ -typedef atomic_t z_erofs_onlinepage_t; - -/* type punning */ -union z_erofs_onlinepage_converter { - z_erofs_onlinepage_t *o; - unsigned long *v; -}; - -static inline unsigned int z_erofs_onlinepage_index(struct page *page) -{ - union z_erofs_onlinepage_converter u; - - DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; -} +#define Z_EROFS_PAGE_EIO (1 << 31) static inline void z_erofs_onlinepage_init(struct page *page) { union { - z_erofs_onlinepage_t o; + atomic_t o; unsigned long v; - /* keep from being unlocked in advance */ } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); @@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page) SetPagePrivate(page); } -static inline void z_erofs_onlinepage_fixup(struct page *page, - uintptr_t index, bool down) +static inline void z_erofs_onlinepage_split(struct page *page) { - union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; - int orig, orig_index, val; - -repeat: - orig = atomic_read(u.o); - orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (orig_index) { - if (!index) - return; + atomic_inc((atomic_t *)&page->private); +} - DBG_BUGON(orig_index != index); - } +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; - val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (atomic_cmpxchg(u.o, orig, val) != orig) - goto repeat; + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); } static inline void z_erofs_onlinepage_endio(struct page *page) { - union z_erofs_onlinepage_converter u; unsigned int v; DBG_BUGON(!PagePrivate(page)); - u.v = &page_private(page); - - v = atomic_dec_return(u.o); - if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); - if (!PageError(page)) + if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } - erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); } -#define Z_EROFS_VMAP_ONSTACK_PAGES \ - min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) -#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 +#define Z_EROFS_ONSTACK_PAGES 32 #endif diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h deleted file mode 100644 index b05464f4a808..000000000000 --- a/fs/erofs/zpvec.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. 
- * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZPVEC_H -#define __EROFS_FS_ZPVEC_H - -#include "tagptr.h" - -/* page type in pagevec for decompress subsystem */ -enum z_erofs_page_type { - /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ - Z_EROFS_PAGE_TYPE_EXCLUSIVE, - - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, - - Z_EROFS_VLE_PAGE_TYPE_HEAD, - Z_EROFS_VLE_PAGE_TYPE_MAX -}; - -extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") - __bad_page_type_exclusive(void); - -/* pagevec tagged pointer */ -typedef tagptr2_t erofs_vtptr_t; - -/* pagevec collector */ -struct z_erofs_pagevec_ctor { - struct page *curr, *next; - erofs_vtptr_t *pages; - - unsigned int nr, index; -}; - -static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - if (!ctor->curr) - return; - - if (atomic) - kunmap_atomic(ctor->pages); - else - kunmap(ctor->curr); -} - -static inline struct page * -z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr) -{ - unsigned int index; - - /* keep away from occupied pages */ - if (ctor->next) - return ctor->next; - - for (index = 0; index < nr; ++index) { - const erofs_vtptr_t t = ctor->pages[index]; - const unsigned int tags = tagptr_unfold_tags(t); - - if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) - return tagptr_unfold_ptr(t); - } - DBG_BUGON(nr >= ctor->nr); - return NULL; -} - -static inline void -z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, - bool atomic) -{ - struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); - - z_erofs_pagevec_ctor_exit(ctor, atomic); - - ctor->curr = next; - ctor->next = NULL; - ctor->pages = atomic ? - kmap_atomic(ctor->curr) : kmap(ctor->curr); - - ctor->nr = PAGE_SIZE / sizeof(struct page *); - ctor->index = 0; -} - -static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned int nr, - erofs_vtptr_t *pages, - unsigned int i) -{ - ctor->nr = nr; - ctor->curr = ctor->next = NULL; - ctor->pages = pages; - - if (i >= nr) { - i -= nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - while (i > ctor->nr) { - i -= ctor->nr; - z_erofs_pagevec_ctor_pagedown(ctor, false); - } - } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); - ctor->index = i; -} - -static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool pvec_safereuse) -{ - if (!ctor->next) { - /* some pages cannot be reused as pvec safely without I/O */ - if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) - type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; - - if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && - ctor->index + 1 == ctor->nr) - return false; - } - - if (ctor->index >= ctor->nr) - z_erofs_pagevec_ctor_pagedown(ctor, false); - - /* exclusive page type must be 0 */ - if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) - __bad_page_type_exclusive(); - - /* should remind that collector->next never equal to 1, 2 */ - if (type == (uintptr_t)ctor->next) { - ctor->next = page; - } - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); - return true; -} - -static inline struct page * -z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) -{ - erofs_vtptr_t t; - - if (ctor->index >= ctor->nr) { - DBG_BUGON(!ctor->next); - z_erofs_pagevec_ctor_pagedown(ctor, true); - } - - t = ctor->pages[ctor->index]; - - *type = tagptr_unfold_tags(t); - - /* should remind that collector->next never equal to 1, 2 */ - if (*type == 
(uintptr_t)ctor->next) - ctor->next = tagptr_unfold_ptr(t); - - ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); - return tagptr_unfold_ptr(t); -} -#endif diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803..7a192e4e7fa9 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { error = dquot_initialize(inode); if (error) return error; } - if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || - (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - error = dquot_transfer(inode, iattr); + if (i_uid_needs_update(mnt_userns, iattr, inode) || + i_gid_needs_update(mnt_userns, iattr, inode)) { + error = dquot_transfer(mnt_userns, inode, iattr); if (error) return error; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..6f475d2e3b18 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1059,9 +1059,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_frags_per_group); goto failed_mount; } - if (sbi->s_inodes_per_group > sb->s_blocksize * 8) { + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, - "error: #inodes per group too big: %lu", + "error: invalid #inodes per group: %lu", sbi->s_inodes_per_group); goto failed_mount; } @@ -1071,6 +1072,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - le32_to_cpu(es->s_first_data_block) - 1) / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group != + le32_to_cpu(es->s_inodes_count)) { + ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu", + le32_to_cpu(es->s_inodes_count), + (u64)sbi->s_groups_count * sbi->s_inodes_per_group); + goto failed_mount; + } db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array(db_count, @@ -1490,8 +1498,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; @@ -1529,8 +1536,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head *bh; while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? 
- sb->s_blocksize - offset : towrite; + tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84c0eb55071d..3dcc1dd1f179 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5350,14 +5350,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (error) return error; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } - if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || - (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, @@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { @@ -5383,10 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Update corresponding info in inode so that everything is in * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd14cef1b08f..d66e37d80a2d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns, { unsigned int ia_valid = attr->ia_valid; - if (ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_MTIME) @@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (err) return err; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { err = f2fs_dquot_initialize(inode); if (err) return err; } - if ((attr->ia_valid & ATTR_UID && - !uid_eq(attr->ia_uid, inode->i_uid)) || - (attr->ia_valid & ATTR_GID && - !gid_eq(attr->ia_gid, inode->i_gid))) { + if (i_uid_needs_update(mnt_userns, attr, inode) || + i_gid_needs_update(mnt_userns, attr, inode)) { f2fs_lock_op(F2FS_I_SB(inode)); - err = dquot_transfer(inode, attr); + err = dquot_transfer(mnt_userns, inode, attr); if (err) { set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); @@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * update uid/gid under lock_op(), so that dquot and inode can * be updated atomically. 
*/ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; + i_uid_update(mnt_userns, attr, inode); + i_gid_update(mnt_userns, attr, inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_unlock_op(F2FS_I_SB(inode)); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3cb7f8a43b4d..dcd0a1e35095 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page) memset(&attr, 0, sizeof(attr)); - attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); - attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid)); + attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid)); - if (!uid_eq(attr.ia_uid, inode->i_uid)) + if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode))) attr.ia_valid |= ATTR_UID; - if (!gid_eq(attr.ia_gid, inode->i_gid)) + if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode))) attr.ia_valid |= ATTR_GID; if (!attr.ia_valid) return 0; - err = dquot_transfer(inode, &attr); + err = dquot_transfer(&init_user_ns, inode, &attr); if (err) set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); return err; diff --git a/fs/fat/file.c b/fs/fat/file.c index 3dae3ed60f3a..3e4eb3467cb4 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) * out the RO attribute for checking by the security * module, just because it maps to a file mode. */ - err = security_inode_setattr(file->f_path.dentry, &ia); + err = security_inode_setattr(file_mnt_user_ns(file), + file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } if (((attr->ia_valid & ATTR_UID) && - (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) || + (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid), + sbi->options.fs_uid))) || ((attr->ia_valid & ATTR_GID) && - (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) || + (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid), + sbi->options.fs_gid))) || ((attr->ia_valid & ATTR_MODE) && (attr->ia_mode & ~FAT_VALID_MODE))) error = -EPERM; diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207e..259ebf438893 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns, return notify_change(mnt_userns, dentry, &newattrs, NULL); } -/* - * Remove special file priviledges (suid, capabilities) when file is written - * to or truncated. - */ -int file_remove_privs(struct file *file) +static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); + int error; int kill; - int error = 0; - /* - * Fast path for nothing security related. - * As well for non-regular files, e.g. blkdev inodes. - * For example, blkdev_write_iter() might get here - * trying to remove privs which it is not allowed to. 
- */ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill < 0) + if (kill <= 0) return kill; - if (kill) - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_privs); /** - * file_update_time - update mtime and ctime time - * @file: file accessed + * file_remove_privs - remove special file privileges (suid, capabilities) + * @file: file to remove privileges from + * + * When file is modified by a write or truncation ensure that special + * file privileges are removed. * - * Update the mtime and ctime members of an inode and mark the inode - * for writeback. Note that this function is meant exclusively for - * usage in the file write path of filesystems, and filesystems may - * choose to explicitly ignore update via this function with the - * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. This can return an error for - * file systems who need to allocate space in order to update an inode. + * Return: 0 on success, negative errno on failure. */ +int file_remove_privs(struct file *file) +{ + return __file_remove_privs(file, 0); +} +EXPORT_SYMBOL(file_remove_privs); -int file_update_time(struct file *file) +static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) { - struct inode *inode = file_inode(file); - struct timespec64 now; int sync_it = 0; - int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2079,37 +2069,127 @@ int file_update_time(struct file *file) if (!sync_it) return 0; - /* Finally allowed to write? Takes lock. */ - if (__mnt_want_write_file(file)) - return 0; + return sync_it; +} + +static int __file_update_time(struct file *file, struct timespec64 *now, + int sync_mode) +{ + int ret = 0; + struct inode *inode = file_inode(file); - ret = inode_update_time(inode, &now, sync_it); - __mnt_drop_write_file(file); + /* try to update time settings */ + if (!__mnt_want_write_file(file)) { + ret = inode_update_time(inode, now, sync_mode); + __mnt_drop_write_file(file); + } return ret; } + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode for + * writeback. Note that this function is meant exclusively for usage in + * the file write path of filesystems, and filesystems may choose to + * explicitly ignore updates via this function with the _NOCMTIME inode + * flag, e.g. for network filesystem where these imestamps are handled + * by the server. This can return an error for file systems who need to + * allocate space in order to update an inode. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int file_update_time(struct file *file) +{ + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); + + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + + return __file_update_time(file, &now, ret); +} EXPORT_SYMBOL(file_update_time); -/* Caller must hold the file's inode lock */ -int file_modified(struct file *file) +/** + * file_modified_flags - handle mandated vfs changes when modifying a file + * @file: file that was modified + * @flags: kiocb flags + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * If IOCB_NOWAIT is set, special file privileges will not be removed and + * time settings will not be updated. It will return -EAGAIN. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +static int file_modified_flags(struct file *file, int flags) { - int err; + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - err = file_remove_privs(file); - if (err) - return err; + ret = __file_remove_privs(file, flags); + if (ret) + return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - return file_update_time(file); + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + return __file_update_time(file, &now, ret); +} + +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int file_modified(struct file *file) +{ + return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); +/** + * kiocb_modified - handle mandated vfs changes when modifying a file + * @iocb: iocb that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int kiocb_modified(struct kiocb *iocb) +{ + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); +} +EXPORT_SYMBOL_GPL(kiocb_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d2a9f699e17e..c681eacc389b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio) +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) { struct iomap_page *iop = to_iomap_page(folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + gfp_t gfp; if (iop || nr_blocks <= 1) return iop; + if (flags & IOMAP_NOWAIT) + gfp = GFP_NOWAIT; + else + gfp = GFP_NOFS | __GFP_NOFAIL; + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - GFP_NOFS | __GFP_NOFAIL); - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); + gfp); + if (iop) { + spin_lock_init(&iop->uptodate_lock); + if (folio_test_uptodate(folio)) + bitmap_fill(iop->uptodate, nr_blocks); + folio_attach_private(folio, iop); + } return iop; } @@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); else iop = to_iomap_page(folio); @@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; @@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = iomap_page_create(iter->inode, folio); + struct iomap_page *iop; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; @@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; folio_clear_error(folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + return -EAGAIN; + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return -EIO; folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_folio_sync(block_start, folio, + int status; + + if (iter->flags & IOMAP_NOWAIT) + return -EAGAIN; + + status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; @@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; 
int status = 0; + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); if (!folio) { - status = -ENOMEM; + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } if (pos + len > folio_pos(folio) + folio_size(folio)) @@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) loff_t pos = iter->pos; ssize_t written = 0; long status = 0; + struct address_space *mapping = iter->inode->i_mapping; + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; @@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: + status = balance_dirty_pages_ratelimited_flags(mapping, + bdp_flags); + if (unlikely(status)) + break; + if (bytes > length) bytes = length; @@ -760,6 +788,10 @@ again: * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * For async buffered writes the assumption is that the user + * page has already been faulted in. This can be optimized by + * faulting the user page. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; @@ -771,7 +803,7 @@ again: break; page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(iter->inode->i_mapping)) + if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); @@ -796,10 +828,12 @@ again: pos += status; written += status; length -= status; - - balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); + if (status == -EAGAIN) { + iov_iter_revert(i, written); + return -EAGAIN; + } return written ? 
written : status; } @@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, }; int ret; + if (iocb->ki_flags & IOCB_NOWAIT) + iter.flags |= IOMAP_NOWAIT; + while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); if (iter.pos == iocb->ki_pos) @@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio); + struct iomap_page *iop = iomap_page_create(inode, folio, 0); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 1d732fd223d4..332dc9ac47a9 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (rc) return rc; - if (is_quota_modification(inode, iattr)) { + if (is_quota_modification(mnt_userns, inode, iattr)) { rc = dquot_initialize(inode); if (rc) return rc; } if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { - rc = dquot_transfer(inode, iattr); + rc = dquot_transfer(mnt_userns, inode, iattr); if (rc) return rc; } diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 05efcdf7a4a7..7c849024999f 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, */ int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags) { int err; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 8c37aaf936ab..70da4c0ba7ad 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, int attr_name_len); int ksmbd_vfs_setxattr(struct user_namespace *user_ns, struct dentry *dentry, const char *attr_name, - const void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e56..c266cfdc3291 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. 
*/ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + int can_sleep, error, type; + struct file_lock fl; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } - if (can_sleep) - lock->fl_flags |= FL_SLEEP; + flock_make_lock(f.file, &fl, type); - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; + + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, - lock); + (can_sleep) ? 
F_SETLKW : F_SETLK, + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } @@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 4f897e109547..cd7d09a569ff 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, const void *data, int data_type, struct inode *dir) { - __u32 marks_mask = 0, marks_ignored_mask = 0; + __u32 marks_mask = 0, marks_ignore_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS | FANOTIFY_EVENT_FLAGS; const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); struct fsnotify_mark *mark; + bool ondir = event_mask & FAN_ONDIR; int type; pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", @@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, return 0; } else if (!(fid_mode & FAN_REPORT_FID)) { /* Do we have a directory inode to report? */ - if (!dir && !(event_mask & FS_ISDIR)) + if (!dir && !ondir) return 0; } fsnotify_foreach_iter_mark_type(iter_info, mark, type) { - /* Apply ignore mask regardless of mark's ISDIR flag */ - marks_ignored_mask |= mark->ignored_mask; + /* + * Apply ignore mask depending on event flags in ignore mask. + */ + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, ondir, type); /* - * If the event is on dir and this mark doesn't care about - * events on dir, don't send it! + * Send the event depending on event flags in mark mask. 
*/ - if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + if (!fsnotify_mask_applicable(mark->mask, ondir, type)) continue; marks_mask |= mark->mask; @@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, *match_mask |= 1U << type; } - test_mask = event_mask & marks_mask & ~marks_ignored_mask; + test_mask = event_mask & marks_mask & ~marks_ignore_mask; /* * For dirent modification events (create/delete/move) that do not carry diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 80e0ec95b113..1d9f11255c64 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; + if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + mflags |= FAN_MARK_IGNORE; return mflags; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b08ce0d821a7..f0e49a406ffa 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); - if (!(flags & FAN_MARK_IGNORED_MASK)) { + if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { - fsn_mark->ignored_mask &= ~mask; + fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* @@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, * changes to the mask. * Destroy mark when only umask bits remain. */ - *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); + *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; @@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); + unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* + * When using FAN_MARK_IGNORE for the first time, mark starts using + * independent event flags in ignore mask. After that, trying to + * update the ignore mask with the old FAN_MARK_IGNORED_MASK API + * will result in EEXIST error. + */ + if (ignore == FAN_MARK_IGNORE) + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; + + /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set - * because of an ignored mask that is now going to survive FS_MODIFY. + * because of an ignore mask that is now going to survive FS_MODIFY. 
*/ - if ((fan_flags & FAN_MARK_IGNORED_MASK) && - (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) @@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, bool recalc; spin_lock(&fsn_mark->lock); - if (!(fan_flags & FAN_MARK_IGNORED_MASK)) + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else - fsn_mark->ignored_mask |= mask; + fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); @@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group) sizeof(struct fanotify_error_event)); } +static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, + unsigned int fan_flags) +{ + /* + * Non evictable mark cannot be downgraded to evictable mark. + */ + if (fan_flags & FAN_MARK_EVICTABLE && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) + return -EEXIST; + + /* + * New ignore mask semantics cannot be downgraded to old semantics. + */ + if (fan_flags & FAN_MARK_IGNORED_MASK && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) + return -EEXIST; + + /* + * An ignore mask that survives modify could never be downgraded to not + * survive modify. With new FAN_MARK_IGNORE semantics we make that rule + * explicit and return an error when trying to update the ignore mask + * without the original FAN_MARK_IGNORED_SURV_MODIFY value. + */ + if (fan_flags & FAN_MARK_IGNORE && + !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && + fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) + return -EEXIST; + + return 0; +} + static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, @@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group, } /* - * Non evictable mark cannot be downgraded to evictable mark. + * Check if requested mark flags conflict with an existing mark flags. */ - if (fan_flags & FAN_MARK_EVICTABLE && - !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) { - ret = -EEXIST; + ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); + if (ret) goto out; - } /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ - if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && + (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; @@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, /* * If some other task has this inode open for write we should not add - * an ignored mark, unless that ignored mark is supposed to survive + * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. 
*/ - if ((flags & FAN_MARK_IGNORED_MASK) && + if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; @@ -1520,7 +1559,8 @@ static int fanotify_events_supported(struct fsnotify_group *group, unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || - (mask & FAN_RENAME); + (mask & FAN_RENAME) || + (flags & FAN_MARK_IGNORE); /* * Some filesystems such as 'proc' acquire unusual locks when opening @@ -1557,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; - bool ignored = flags & FAN_MARK_IGNORED_MASK; + unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; + unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; @@ -1586,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; } - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) @@ -1606,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ~valid_mask) return -EINVAL; - /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */ - if (ignored) + + /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ + if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) + return -EINVAL; + + /* + * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with + * FAN_MARK_IGNORED_MASK. + */ + if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; + umask = FANOTIFY_EVENT_FLAGS; + } f = fdget(fanotify_fd); if (unlikely(!f.file)) @@ -1672,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; - if (flags & FAN_MARK_FLUSH) { + if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); @@ -1688,7 +1739,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto fput_and_out; - if (flags & FAN_MARK_ADD) { + if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; @@ -1712,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; + ret = mnt ? -EINVAL : -EISDIR; + /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ + if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && + (mnt || S_ISDIR(inode->i_mode)) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) + goto path_put_and_out; + /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; @@ -1721,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * events with parent/name info for non-directory. 
*/ if ((fid_mode & FAN_REPORT_DIR_FID) && - (flags & FAN_MARK_ADD) && !ignored) + (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { + switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, @@ -1804,7 +1862,7 @@ static int __init fanotify_user_setup(void) BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 59fb40abe33d..55081ae3a6ec 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); + mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); @@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct mount *mnt = fsnotify_conn_mount(mark->connector); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", - mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + mnt->mnt_id, mflags, mark->mask, mark->ignore_mask); } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { struct super_block *sb = fsnotify_conn_sb(mark->connector); seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", - sb->s_dev, mflags, mark->mask, mark->ignored_mask); + sb->s_dev, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 0b3e74935cb4..7974e91ffe13 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb) * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the - * parent cares. Thus when an event happens on a child it can quickly tell if + * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. 
*/ void __fsnotify_update_child_dentry_flags(struct inode *inode) @@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type, struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; - __u32 marks_ignored_mask = 0; + __u32 marks_ignore_mask = 0; + bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; @@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) - mark->ignored_mask = 0; + mark->ignore_mask = 0; } } @@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type, fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; - marks_ignored_mask |= mark->ignored_mask; + marks_ignore_mask |= + fsnotify_effective_ignore_mask(mark, is_dir, type); } - pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", - __func__, group, mask, marks_mask, marks_ignored_mask, + pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", + __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); - if (!(test_mask & marks_mask & ~marks_ignored_mask)) + if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { @@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types( * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && - !(mark->mask & FS_EVENT_ON_CHILD)) + !(mark->mask & FS_EVENT_ON_CHILD) && + !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); @@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * If this is a modify event we may need to clear some ignored masks. - * In that case, the object with ignored masks will have the FS_MODIFY + * If this is a modify event we may need to clear some ignore masks. + * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. 
*/ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ed42a189faa2..1c4bfdab008d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask) IN_Q_OVERFLOW); } -/* intofiy userspace file descriptor functions */ +/* inotify userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 4de597a83b88..52615e6090e1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, a = (ATTR_RECORD*)((u8*)ctx->attr + le32_to_cpu(ctx->attr->length)); for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { - if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + - le32_to_cpu(ctx->mrec->bytes_allocated)) + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end || + name_end > mrec_end) break; ctx->attr = a; if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7497cd592258..9c67edd215d5 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (status) return status; - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { status = dquot_initialize(inode); if (status) return status; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 337527571461..740b64238312 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -277,7 +277,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ - OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) - || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 0b0ae3ebb0cf..da7718cef735 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid || - !si->si_slots[preferred].sl_node_num) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid || - !si->si_slots[i].sl_node_num) { + if (!si->si_slots[i].sl_valid) { ret = i; break; } @@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - if (ocfs2_mount_local(osb)) - /* use slot 0 directly in local mode */ - slot = 0; - else { - /* search for ourselves first 
and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot < 0) { + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " - "already allocated to this node!\n", - slot, osb->dev_str); - } + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f7298816d8d9..438be028935d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -172,7 +172,6 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, - Opt_nocluster, Opt_err, }; @@ -206,7 +205,6 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, - {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } - tmp = OCFS2_MOUNT_NOCLUSTER; - if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { - ret = -EINVAL; - mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); - goto out; - } - tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && - !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : "ordered"); - if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && - !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) - printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " - "without cluster aware mode.\n", osb->dev_str); - atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; - case Opt_nocluster: - mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; - break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); - if (opts & OCFS2_MOUNT_NOCLUSTER) - seq_printf(s, ",nocluster"); - return 0; } diff --git a/fs/open.c b/fs/open.c index 1d57fbde2feb..2790aac66e58 100644 --- a/fs/open.c +++ b/fs/open.c @@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } +/** + * setattr_vfsuid - check and set ia_fsuid attribute + * @kuid: new inode owner + * + * Check whether @kuid is valid and if so generate and set vfsuid_t in + * ia_vfsuid. + * + * Return: true if @kuid is valid, false if not. + */ +static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) +{ + if (!uid_valid(kuid)) + return false; + attr->ia_valid |= ATTR_UID; + attr->ia_vfsuid = VFSUIDT_INIT(kuid); + return true; +} + +/** + * setattr_vfsgid - check and set ia_fsgid attribute + * @kgid: new inode owner + * + * Check whether @kgid is valid and if so generate and set vfsgid_t in + * ia_vfsgid. + * + * Return: true if @kgid is valid, false if not. + */ +static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) +{ + if (!gid_valid(kgid)) + return false; + attr->ia_valid |= ATTR_GID; + attr->ia_vfsgid = VFSGIDT_INIT(kgid); + return true; +} + int chown_common(const struct path *path, uid_t user, gid_t group) { struct user_namespace *mnt_userns, *fs_userns; @@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group) mnt_userns = mnt_user_ns(path->mnt); fs_userns = i_user_ns(inode); - uid = mapped_kuid_user(mnt_userns, fs_userns, uid); - gid = mapped_kgid_user(mnt_userns, fs_userns, gid); retry_deleg: newattrs.ia_valid = ATTR_CTIME; - if (user != (uid_t) -1) { - if (!uid_valid(uid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = uid; - } - if (group != (gid_t) -1) { - if (!gid_valid(gid)) - return -EINVAL; - newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = gid; - } + if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) + return -EINVAL; + if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) + return -EINVAL; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; inode_lock(inode); - error = security_path_chown(path, uid, gid); + /* Continue to send actual fs values, not the mount values. 
*/ + error = security_path_chown( + path, + from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid), + from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(mnt_userns, path->dentry, &newattrs, &delegated_inode); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 714ec569d25b..245e2cb62708 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry, if (!err) { struct iattr attr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = stat->uid, - .ia_gid = stat->gid, + .ia_vfsuid = VFSUIDT_INIT(stat->uid), + .ia_vfsgid = VFSGIDT_INIT(stat->gid), }; err = ovl_do_notify_change(ofs, upperdentry, &attr); } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 492eddeb481f..7922b619f6c8 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } +/* + * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone + * of the POSIX ACLs retrieved from the lower layer to this function to not + * alter the POSIX ACLs for the underlying filesystem. + */ +static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, + struct posix_acl *acl) +{ + for (unsigned int i = 0; i < acl->a_count; i++) { + vfsuid_t vfsuid; + vfsgid_t vfsgid; + + struct posix_acl_entry *e = &acl->a_entries[i]; + switch (e->e_tag) { + case ACL_USER: + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + e->e_uid = vfsuid_into_kuid(vfsuid); + break; + case ACL_GROUP: + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + e->e_gid = vfsgid_into_kgid(vfsgid); + break; + } + } +} + +/* + * When the relevant layer is an idmapped mount we need to take the idmapping + * of the layer into account and translate any ACL_{GROUP,USER} values + * according to the idmapped mount. + * + * We cannot alter the ACLs returned from the relevant layer as that would + * alter the cached values filesystem wide for the lower filesystem. Instead we + * can clone the ACLs and then apply the relevant idmapping of the layer. + * + * This is obviously only relevant when idmapped layers are used. + */ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); - const struct cred *old_cred; - struct posix_acl *acl; + struct posix_acl *acl, *clone; + struct path realpath; if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; - if (rcu) - return get_cached_acl_rcu(realinode, type); + /* Careful in RCU walk mode */ + ovl_i_path_real(inode, &realpath); + if (!realpath.dentry) { + WARN_ON(!rcu); + return ERR_PTR(-ECHILD); + } - old_cred = ovl_override_creds(inode->i_sb); - acl = get_acl(realinode, type); - revert_creds(old_cred); + if (rcu) { + acl = get_cached_acl_rcu(realinode, type); + } else { + const struct cred *old_cred; + + old_cred = ovl_override_creds(inode->i_sb); + acl = get_acl(realinode, type); + revert_creds(old_cred); + } + /* + * If there are no POSIX ACLs, or we encountered an error, + * or the layer isn't idmapped we don't need to do anything. + */ + if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl)) + return acl; - return acl; + /* + * We only get here if the layer is idmapped. So drop out of RCU path + * walk so we can clone the ACLs. There's no need to release the ACLs + * since get_cached_acl_rcu() doesn't take a reference on the ACLs. 
+ */ + if (rcu) + return ERR_PTR(-ECHILD); + + clone = posix_acl_clone(acl, GFP_KERNEL); + if (!clone) + clone = ERR_PTR(-ENOMEM); + else + ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + /* + * Since we're not in RCU path walk we always need to release the + * original ACLs. + */ + posix_acl_release(acl); + return clone; } int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f34b7e02eee..6ec815b84d48 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { - struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs); - struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry)); - - if (attr->ia_valid & ATTR_UID) - attr->ia_uid = mapped_kuid_user(upper_mnt_userns, - fs_userns, attr->ia_uid); - if (attr->ia_valid & ATTR_GID) - attr->ia_gid = mapped_kgid_user(upper_mnt_userns, - fs_userns, attr->ia_gid); - - return notify_change(upper_mnt_userns, upperdentry, attr, NULL); + return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, @@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags); + int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, + (void *)value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 1ce5c9698393..e0a2e0468ee7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1003,9 +1003,6 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - return ovl_xattr_get(dentry, inode, handler->name, buffer, size); } @@ -1021,9 +1018,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, struct posix_acl *acl = NULL; int err; - if (!IS_POSIXACL(inode)) - return -EOPNOTSUPP; - /* Check that everything is OK before copy-up */ if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); @@ -1966,20 +1960,6 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } -static bool ovl_has_idmapped_layers(struct ovl_fs *ofs) -{ - - unsigned int i; - const struct vfsmount *mnt; - - for (i = 0; i < ofs->numlayer; i++) { - mnt = ofs->layers[i].mnt; - if (mnt && is_idmapped_mnt(mnt)) - return true; - } - return false; -} - static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -2149,10 +2129,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ofs->config.userxattr ? 
ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; - if (ovl_has_idmapped_layers(ofs)) - pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n"); - else - sb->s_flags |= SB_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 962d32468eb4..1d17d7b13dcd 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ -static struct posix_acl * +struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; @@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) } return clone; } +EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. @@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; - kuid_t uid; - kgid_t gid; + vfsuid_t vfsuid; + vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - uid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(uid, current_fsuid())) + vfsuid = i_uid_into_vfsuid(mnt_userns, inode); + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: - uid = mapped_kuid_fs(mnt_userns, - i_user_ns(inode), + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, pa->e_uid); - if (uid_eq(uid, current_fsuid())) + if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - gid = i_gid_into_mnt(mnt_userns, inode); - if (in_group_p(gid)) { + vfsgid = i_gid_into_vfsgid(mnt_userns, inode); + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - gid = mapped_kgid_fs(mnt_userns, - i_user_ns(inode), + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, pa->e_gid); - if (in_group_p(gid)) { + if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns, return error; if (error == 0) *acl = NULL; - if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; @@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. 
*/ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - struct user_namespace *mnt_userns, - void *value, size_t size, bool from_user) +static int posix_acl_fix_xattr_common(void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + int count; + + if (!header) + return -EINVAL; + if (size < sizeof(struct posix_acl_xattr_header)) + return -EINVAL; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return -EINVAL; + + count = posix_acl_xattr_count(size); + if (count < 0) + return -EINVAL; + if (count == 0) + return -EINVAL; + + return count; +} + +void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; kuid_t uid; kgid_t gid; - if (!value) + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - if (size < sizeof(struct posix_acl_xattr_header)) + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, + vfsuid_into_kuid(vfsuid))); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, + vfsgid_into_kgid(vfsgid))); + break; + default: + break; + } + } +} + +void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, + const struct inode *inode, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + vfsuid_t vfsuid; + vfsgid_t vfsgid; + kuid_t uid; + kgid_t gid; + + if (no_idmapping(mnt_userns, i_user_ns(inode))) return; - count = posix_acl_xattr_count(size); + count = posix_acl_fix_xattr_common(value, size); if (count < 0) return; - if (count == 0) + + for (end = entry + count; entry != end; entry++) { + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsuid = VFSUIDT_INIT(uid); + uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); + break; + case ACL_GROUP: + gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); + vfsgid = VFSGIDT_INIT(gid); + gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); + break; + default: + break; + } + } +} + +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + struct posix_acl_xattr_header *header = value; + struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + int count; + kuid_t uid; + kgid_t gid; + + count = posix_acl_fix_xattr_common(value, size); + if (count < 0) return; for (end = entry + count; entry != end; entry++) { switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); - if (from_user) - uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid); - else - uid = mapped_kuid_fs(mnt_userns, 
&init_user_ns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); - if (from_user) - gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid); - else - gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_from_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, - size, true); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); } -void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, - struct inode *inode, - void *value, size_t size) +void posix_acl_fix_xattr_to_user(void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - - /* Leave ids untouched on non-idmapped mounts. */ - if (no_idmapping(mnt_userns, i_user_ns(inode))) - mnt_userns = &init_user_ns; - if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) + if (user_ns == &init_user_ns) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, - size, false); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); } /* diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..28966da7834e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ -int dquot_transfer(struct inode *inode, struct iattr *iattr) +int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode, + struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; @@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) if (!dquot_active(inode)) return 0; - if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){ - dquot = dqget(sb, make_kqid_uid(iattr->ia_uid)); + if (i_uid_needs_update(mnt_userns, iattr, inode)) { + kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); + + dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); @@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } transfer_to[USRQUOTA] = dquot; } - if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){ - dquot = dqget(sb, make_kqid_gid(iattr->ia_gid)); + if (i_gid_needs_update(mnt_userns, iattr, inode)) { + kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); + + dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); diff --git a/fs/read_write.c b/fs/read_write.c index e0777eefd846..0131d0df0476 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1263,6 +1263,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, count, fl); file_end_write(out.file); } else { + if (out.file->f_flags & O_NONBLOCK) + fl |= SPLICE_F_NONBLOCK; + retval = 
splice_file_to_pipe(in.file, opipe, &pos, count, fl); } @@ -1660,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && + !((iocb->ki_flags & IOCB_DIRECT) || + (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0cffe054b78e..0df48d176732 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, struct buffer_head *bh; struct item_head *ih, tmp_ih; b_blocknr_t blocknr; - char *p = NULL; + char *p; int chars; int ret; int result; @@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, result = search_for_position_by_key(inode->i_sb, &key, &path); if (result != POSITION_FOUND) { pathrelse(&path); - if (p) - kunmap(bh_result->b_page); if (result == IO_ERROR) return -EIO; /* @@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, } pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return ret; } /* requested data are in direct item(s) */ @@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * when it is stored in direct item(s) */ pathrelse(&path); - if (p) - kunmap(bh_result->b_page); return -ENOENT; } @@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block, * sure we need to. But, this means the item might move if * kmap schedules */ - if (!p) - p = (char *)kmap(bh_result->b_page); - + p = (char *)kmap(bh_result->b_page); p += offset; memset(p, 0, inode->i_sb->s_blocksize); do { @@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - if (is_quota_modification(inode, attr)) { + if (is_quota_modification(mnt_userns, inode, attr)) { error = dquot_initialize(inode); if (error) return error; @@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, reiserfs_write_unlock(inode->i_sb); if (error) goto out; - error = dquot_transfer(inode, attr); + error = dquot_transfer(mnt_userns, inode, attr); reiserfs_write_lock(inode->i_sb); if (error) { journal_end(&th); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..de86f5b2859f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. 
@@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/fs/xattr.c b/fs/xattr.c index e8dd03e4561e..a1f4998bc6be 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -282,9 +282,15 @@ out: } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); +static inline bool is_posix_acl_xattr(const char *name) +{ + return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || + (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); +} + int vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) + const char *name, void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(mnt_userns, dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, + (const void **)&value, size); if (error < 0) return error; size = error; } + if (size && is_posix_acl_xattr(name)) + posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size); + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, @@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, return ret; } nolsm: - return __vfs_getxattr(dentry, inode, name, value, size); + error = __vfs_getxattr(dentry, inode, name, value, size); + if (error > 0 && is_posix_acl_xattr(name)) + posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size); + return error; } EXPORT_SYMBOL_GPL(vfs_getxattr); @@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns, if (ctx->size && ((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))) - posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d), - ctx->kvalue, ctx->size); + posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size); } int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, @@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d, if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d), - ctx->kvalue, error); + posix_acl_fix_xattr_to_user(ctx->kvalue, error); if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b..8d9b14d2b912 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -410,7 +410,7 @@ restart: spin_unlock(&ip->i_flags_lock); out: - return file_modified(file); + return kiocb_modified(iocb); } static int @@ -700,12 +700,11 @@ xfs_file_buffered_write( bool cleared_space = false; unsigned int iolock; - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) @@ -1165,7 +1164,7 @@ 
xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a3..5d50fed291b4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -664,7 +664,7 @@ xfs_ilock_for_iomap( unsigned flags, unsigned *lockmode) { - unsigned mode = XFS_ILOCK_SHARED; + unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* @@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; if (xfs_is_shutdown(mp)) return -EIO; @@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); - xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { @@ -1172,7 +1175,7 @@ xfs_read_iomap_begin( xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..a7402f6ea510 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -667,13 +667,15 @@ xfs_setattr_nonsize( uint qflags = 0; if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = iattr->ia_uid; + uid = from_vfsuid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsuid); qflags |= XFS_QMOPT_UQUOTA; } else { uid = inode->i_uid; } if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = iattr->ia_gid; + gid = from_vfsgid(mnt_userns, i_user_ns(inode), + iattr->ia_vfsgid); qflags |= XFS_QMOPT_GQUOTA; } else { gid = inode->i_gid; @@ -704,13 +706,13 @@ xfs_setattr_nonsize( * didn't have the inode locked, inode's dquot(s) would have changed * also. */ - if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && - !uid_eq(inode->i_uid, iattr->ia_uid)) { + if (XFS_IS_UQUOTA_ON(mp) && + i_uid_needs_update(mnt_userns, iattr, inode)) { ASSERT(udqp); old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && - !gid_eq(inode->i_gid, iattr->ia_gid)) { + if (XFS_IS_GQUOTA_ON(mp) && + i_gid_needs_update(mnt_userns, iattr, inode)) { ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(gdqp); old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 053299758deb..f5d8338967cb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns, !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(inode, iattr); + ret = dquot_transfer(mnt_userns, inode, iattr); if (ret) return ret; } |
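Editor's note: the fanotify changes in this series introduce FAN_MARK_IGNORE, under which event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) are honoured in the ignore mask and directory/mount/sb ignore marks must also pass FAN_MARK_IGNORED_SURV_MODIFY. The following is a minimal userspace sketch, not part of the patch, of how the new flag might be exercised. The paths used and the fallback #define for FAN_MARK_IGNORE are illustrative assumptions for headers that predate this series.

/*
 * Sketch only: watch open events on a directory, then suppress them for one
 * subdirectory using the new FAN_MARK_IGNORE semantics. Assumes a kernel that
 * carries the fanotify changes above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/fanotify.h>

#ifndef FAN_MARK_IGNORE
#define FAN_MARK_IGNORE	0x00000400	/* assumption: UAPI value if the header lacks it */
#endif

int main(void)
{
	int fd;

	fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_DFID_NAME, O_RDONLY);
	if (fd < 0) {
		perror("fanotify_init");
		return EXIT_FAILURE;
	}

	/* Watch open events on /tmp and its immediate children. */
	if (fanotify_mark(fd, FAN_MARK_ADD,
			  FAN_OPEN | FAN_ONDIR | FAN_EVENT_ON_CHILD,
			  AT_FDCWD, "/tmp") < 0) {
		perror("fanotify_mark add");
		return EXIT_FAILURE;
	}

	/*
	 * Suppress events for one subdirectory. With FAN_MARK_IGNORE the
	 * FAN_ONDIR flag takes effect in the ignore mask, and a directory
	 * ignore mark must also be marked as surviving modify events.
	 */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_IGNORE |
			      FAN_MARK_IGNORED_SURV_MODIFY,
			  FAN_OPEN | FAN_ONDIR,
			  AT_FDCWD, "/tmp/noisy") < 0) {
		perror("fanotify_mark ignore");
		return EXIT_FAILURE;
	}

	/* ... read and handle events from fd as usual ... */
	return EXIT_SUCCESS;
}

Note that, per fanotify_may_update_existing_mark() above, a mark that has once been updated with FAN_MARK_IGNORE cannot later be updated with the legacy FAN_MARK_IGNORED_MASK, nor can a surviving-modify ignore mark be downgraded; both attempts return EEXIST.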