From 8603b6f58637ce196d68f7749633ea81af196d66 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging with this regard, but they do not help in case the logs rotate. Hence, lets make sure it is possible to permanently record a CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- fs/coredump.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 7bad7785e8e6..3e8630c8d627 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -534,6 +538,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); -- cgit v1.2.3 From 71dd5d651be7c99c401ad642e9f3cba88b956bce Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:44 +0800 Subject: ocfs2/cluster: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_equal directly for bitmap operations. Link: https://lkml.kernel.org/r/20221007124846.186453-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 20 ++++++++++---------- fs/ocfs2/cluster/nodemanager.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b13d344d40b6..8fe6031f60e3 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg) /* negotiate timeout must be less than write timeout. */ schedule_delayed_work(®->hr_nego_timeout_work, msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); - memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } static void o2hb_disarm_timeout(struct o2hb_region *reg) @@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work) config_item_name(®->hr_item), reg->hr_bdev); set_bit(master_node, reg->hr_nego_node_bitmap); } - if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, - sizeof(reg->hr_nego_node_bitmap))) { + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { /* check negotiate bitmap every second to do timeout * approve decision. */ @@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", @@ -1437,11 +1437,11 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 27fee68f860a..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; -- cgit v1.2.3 From 6d4a93b6809270e5c7d7216b20f4ef2e88213eb3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:45 +0800 Subject: ocfs2: use bitmap API in fill_node_map Pass bits directly into fill_node_map helper and use bitmap API directly to simplify code. Link: https://lkml.kernel.org/r/20221007124846.186453-2-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 18 ++++++++---------- fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/netdebug.c | 2 +- fs/ocfs2/cluster/tcp.c | 6 ++---- fs/ocfs2/dlm/dlmdomain.c | 2 +- fs/ocfs2/stack_o2cb.c | 6 +++--- 6 files changed, 16 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 8fe6031f60e3..60b97c92e2b2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (reg->hr_last_hb_status) return; - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); /* lowest node as master node to make negotiate decision. */ master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); @@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1450,23 +1450,21 @@ void o2hb_init(void) /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num) unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, @@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 1d4100abf6f8..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 7524994e3199..35c05c18de59 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f660c0dbdb63..6f5a3fb97c7f 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) continue; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c4eccd499db8..f46941ff665d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1604,7 +1604,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) /* group sem locking should work for us here -- we're already * registered for heartbeat events so filling this should be * atomic wrt getting those handlers called. */ - o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 88f75f7f02d7..c973c03f6fd8 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -273,17 +273,17 @@ static int o2cb_cluster_check(void) */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { - o2hb_fill_node_map(hbmap, sizeof(hbmap)); + o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } - o2net_fill_node_map(netmap, sizeof(netmap)); + o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); - if (!memcmp(hbmap, netmap, sizeof(hbmap))) + if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); -- cgit v1.2.3 From b270f492dc45b59d573cd80795e8b4781e959836 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 7 Oct 2022 20:48:46 +0800 Subject: ocfs2/dlm: use bitmap API instead of hand-writing it Use bitmap_zero/bitmap_copy/bitmap_qeual directly for bitmap operations. Link: https://lkml.kernel.org/r/20221007124846.186453-3-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmcommon.h | 2 +- fs/ocfs2/dlm/dlmdomain.c | 17 +++++++---------- fs/ocfs2/dlm/dlmmaster.c | 30 +++++++++++++++--------------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fd2022712167..20f790a47484 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err) static inline void dlm_node_iter_init(unsigned long *map, struct dlm_node_iter *iter) { - memcpy(iter->node_map, map, sizeof(iter->node_map)); + bitmap_copy(iter->node_map, map, O2NM_MAX_NODES); iter->curnode = -1; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index f46941ff665d..5c04dde99981 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); /* For now, we restart the process if the node maps have * changed at all */ - ret = memcmp(ctxt->live_map, dlm->live_nodes_map, - sizeof(dlm->live_nodes_map)); + ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map, + O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); if (ret) @@ -1607,10 +1607,8 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES); spin_lock(&dlm->spinlock); - memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); - + bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES); __dlm_set_joining_node(dlm, dlm->node_num); - spin_unlock(&dlm->spinlock); node = -1; @@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) * yes_resp_map. Copy that into our domain map and send a join * assert message to clean up everyone elses state. */ spin_lock(&dlm->spinlock); - memcpy(dlm->domain_map, ctxt->yes_resp_map, - sizeof(ctxt->yes_resp_map)); + bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES); set_bit(dlm->node_num, dlm->domain_map); spin_unlock(&dlm->spinlock); @@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", dlm->recovery_map, &(dlm->recovery_map[0])); - memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); - memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); - memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES); + bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES); + bitmap_zero(dlm->domain_map, O2NM_MAX_NODES); dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 227da5b1b6ab..d610da8e2f24 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, mle->type = type; INIT_HLIST_NODE(&mle->master_hash_node); INIT_LIST_HEAD(&mle->hb_events); - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); spin_lock_init(&mle->spinlock); init_waitqueue_head(&mle->wq); atomic_set(&mle->woken, 0); kref_init(&mle->mle_refs); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; mle->inuse = 0; @@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, atomic_inc(&dlm->mle_cur_count[mle->type]); /* copy off the node_map and register hb callbacks on our copy */ - memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); - memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES); + bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES); clear_bit(dlm->node_num, mle->vote_map); clear_bit(dlm->node_num, mle->node_map); @@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, spin_unlock(&dlm->track_lock); memset(res->lvb, 0, DLM_LVB_LEN); - memset(res->refmap, 0, sizeof(res->refmap)); + bitmap_zero(res->refmap, O2NM_MAX_NODES); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -1036,10 +1036,10 @@ recheck: spin_lock(&mle->spinlock); m = mle->master; - map_changed = (memcmp(mle->vote_map, mle->node_map, - sizeof(mle->vote_map)) != 0); - voting_done = (memcmp(mle->vote_map, mle->response_map, - sizeof(mle->vote_map)) == 0); + map_changed = !bitmap_equal(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); + voting_done = bitmap_equal(mle->vote_map, mle->response_map, + O2NM_MAX_NODES); /* restart if we hit any errors */ if (map_changed) { @@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* now blank out everything, as if we had never * contacted anyone */ - memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); - memset(mle->response_map, 0, sizeof(mle->response_map)); + bitmap_zero(mle->maybe_map, O2NM_MAX_NODES); + bitmap_zero(mle->response_map, O2NM_MAX_NODES); /* reset the vote_map to the current node_map */ - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); + bitmap_copy(mle->vote_map, mle->node_map, + O2NM_MAX_NODES); /* put myself into the maybe map */ if (mle->type != DLM_MLE_BLOCK) set_bit(dlm->node_num, mle->maybe_map); @@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) flags = item->u.am.flags; spin_lock(&dlm->spinlock); - memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES); spin_unlock(&dlm->spinlock); clear_bit(dlm->node_num, nodemap); @@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ret = 0; } - memset(iter.node_map, 0, sizeof(iter.node_map)); + bitmap_zero(iter.node_map, O2NM_MAX_NODES); set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 52ad342fec3e..50da8af988c1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_reco_node_data *ndata; spin_lock(&dlm->spinlock); - memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES); /* nodes can only be removed (by dying) after dropping * this lock, and death will be trapped later, so this should do */ spin_unlock(&dlm->spinlock); -- cgit v1.2.3 From 12b9d301ff73122aebd78548fa4c04ca69ed78fe Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 29 Sep 2022 12:29:33 +0800 Subject: proc/vmcore: fix potential memory leak in vmcore_init() Patch series "Some minor cleanup patches resent". The first three patches trivial clean up patches. And for the patch "kexec: replace crash_mem_range with range", I got a ibm-p9wr ppc64le system to test, it works well. This patch (of 4): elfcorehdr_alloc() allocates a memory chunk for elfcorehdr_addr with kzalloc(). If is_vmcore_usable() returns false, elfcorehdr_addr is a predefined value. If parse_crash_elf_headers() gets some error and returns a negetive value, the elfcorehdr_addr should be released with elfcorehdr_free(). Fix it by calling elfcorehdr_free() when parse_crash_elf_headers() fails. Link: https://lkml.kernel.org/r/20220929042936.22012-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220929042936.22012-2-bhe@redhat.com Signed-off-by: Jianglei Nie Signed-off-by: Baoquan He Acked-by: Baoquan He Cc: Benjamin Herrenschmidt Cc: Chen Lifu Cc: "Eric W . Biederman" Cc: Li Chen Cc: Michael Ellerman Cc: Paul Mackerras Cc: Petr Mladek Cc: Russell King Cc: ye xingchen Cc: Zeal Robot Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f2aa86c421f2..74747571d58e 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1567,6 +1567,7 @@ static int __init vmcore_init(void) return rc; rc = parse_crash_elf_headers(); if (rc) { + elfcorehdr_free(elfcorehdr_addr); pr_warn("Kdump: vmcore not initialized\n"); return rc; } -- cgit v1.2.3 From f1f1f2569901ec5b9d425f2e91c09a0e320768f3 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Thu, 22 Sep 2022 15:40:26 -0700 Subject: proc: report open files as size in stat() for /proc/pid/fd Many monitoring tools include open file count as a metric. Currently the only way to get this number is to enumerate the files in /proc/pid/fd. The problem with the current approach is that it does many things people generally don't care about when they need one number for a metric. In our tests for cadvisor, which reports open file counts per cgroup, we observed that reading the number of open files is slow. Out of 35.23% of CPU time spent in `proc_readfd_common`, we see 29.43% spent in `proc_fill_cache`, which is responsible for filling dentry info. Some of this extra time is spinlock contention, but it's a contention for the lock we don't want to take to begin with. We considered putting the number of open files in /proc/pid/status. Unfortunately, counting the number of fds involves iterating the open_files bitmap, which has a linear complexity in proportion with the number of open files (bitmap slots really, but it's close). We don't want to make /proc/pid/status any slower, so instead we put this info in /proc/pid/fd as a size member of the stat syscall result. Previously the reported number was zero, so there's very little risk of breaking anything, while still providing a somewhat logical way to count the open files with a fallback if it's zero. RFC for this patch included iterating open fds under RCU. Thanks to Frank Hofmann for the suggestion to use the bitmap instead. Previously: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 0 Blocks: 0 IO Block: 1024 directory ``` With this patch: ``` $ sudo stat /proc/1/fd | head -n2 File: /proc/1/fd Size: 65 Blocks: 0 IO Block: 1024 directory ``` Correctness check: ``` $ sudo ls /proc/1/fd | wc -l 65 ``` I added the docs for /proc//fd while I'm at it. [ivan@cloudflare.com: use bitmap_weight() to count the bits] Link: https://lkml.kernel.org/r/20221018045844.37697-1-ivan@cloudflare.com [akpm@linux-foundation.org: include linux/bitmap.h for bitmap_weight()] [ivan@cloudflare.com: return errno from proc_fd_getattr() instead of setting negative size] Link: https://lkml.kernel.org/r/20221024173140.30673-1-ivan@cloudflare.com Link: https://lkml.kernel.org/r/20220922224027.59266-1-ivan@cloudflare.com Signed-off-by: Ivan Babrou Cc: Alexey Dobriyan Cc: Al Viro Cc: Christoph Anton Mitterer Cc: David Hildenbrand Cc: David Laight Cc: Ivan Babrou Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 17 ++++++++++++++ fs/proc/fd.c | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 898c99eae8e4..ec6cfdf1796a 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -47,6 +47,7 @@ fixes/update part 1.1 Stefani Seibold June 9 2009 3.10 /proc//timerslack_ns - Task timerslack value 3.11 /proc//patch_state - Livepatch patch operation state 3.12 /proc//arch_status - Task architecture specific information + 3.13 /proc//fd - List of symlinks to open files 4 Configuring procfs 4.1 Mount options @@ -2149,6 +2150,22 @@ AVX512_elapsed_ms the task is unlikely an AVX512 user, but depends on the workload and the scheduling scenario, it also could be a false negative mentioned above. +3.13 /proc//fd - List of symlinks to open files +------------------------------------------------------- +This directory contains symbolic links which represent open files +the process is maintaining. Example output:: + + lr-x------ 1 root root 64 Sep 20 17:53 0 -> /dev/null + l-wx------ 1 root root 64 Sep 20 17:53 1 -> /dev/null + lrwx------ 1 root root 64 Sep 20 17:53 10 -> 'socket:[12539]' + lrwx------ 1 root root 64 Sep 20 17:53 11 -> 'socket:[12540]' + lrwx------ 1 root root 64 Sep 20 17:53 12 -> 'socket:[12542]' + +The number of open files for the process is stored in 'size' member +of stat() output for /proc//fd for fast access. +------------------------------------------------------- + + Chapter 4: Configuring procfs ============================= diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36..fc46d6fe080c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,30 @@ out: return 0; } +static int proc_readfd_count(struct inode *inode, loff_t *count) +{ + struct task_struct *p = get_proc_task(inode); + struct fdtable *fdt; + + if (!p) + return -ENOENT; + + task_lock(p); + if (p->files) { + rcu_read_lock(); + + fdt = files_fdtable(p->files); + *count = bitmap_weight(fdt->open_fds, fdt->max_fds); + + rcu_read_unlock(); + } + task_unlock(p); + + put_task_struct(p); + + return 0; +} + static int proc_readfd(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); @@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns, return rv; } +static int proc_fd_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + int rv = 0; + + generic_fillattr(&init_user_ns, inode, stat); + + /* If it's a directory, put the number of open fds there */ + if (S_ISDIR(inode->i_mode)) { + rv = proc_readfd_count(inode, &stat->size); + if (rv < 0) + return rv; + } + + return rv; +} + const struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, .permission = proc_fd_permission, + .getattr = proc_fd_getattr, .setattr = proc_setattr, }; -- cgit v1.2.3 From 941baf6febaa88c057192084ca280d976c7c7239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 8 Sep 2022 21:21:54 +0300 Subject: proc: give /proc/cmdline size Most /proc files don't have length (in fstat sense). This leads to inefficiencies when reading such files with APIs commonly found in modern programming languages. They open file, then fstat descriptor, get st_size == 0 and either assume file is empty or start reading without knowing target size. cat(1) does OK because it uses large enough buffer by default. But naive programs copy-pasted from SO aren't: let mut f = std::fs::File::open("/proc/cmdline").unwrap(); let mut buf: Vec = Vec::new(); f.read_to_end(&mut buf).unwrap(); will result in openat(AT_FDCWD, "/proc/cmdline", O_RDONLY|O_CLOEXEC) = 3 statx(0, NULL, AT_STATX_SYNC_AS_STAT, STATX_ALL, NULL) = -1 EFAULT (Bad address) statx(3, "", AT_STATX_SYNC_AS_STAT|AT_EMPTY_PATH, STATX_ALL, {stx_mask=STATX_BASIC_STATS|STATX_MNT_ID, stx_attributes=0, stx_mode=S_IFREG|0444, stx_size=0, ...}) = 0 lseek(3, 0, SEEK_CUR) = 0 read(3, "BOOT_IMAGE=(hd3,gpt2)/vmlinuz-5.", 32) = 32 read(3, "19.6-100.fc35.x86_64 root=/dev/m", 32) = 32 read(3, "apper/fedora_localhost--live-roo"..., 64) = 64 read(3, "ocalhost--live-swap rd.lvm.lv=fe"..., 128) = 116 read(3, "", 12) open/stat is OK, lseek looks silly but there are 3 unnecessary reads because Rust starts with 32 bytes per Vec and grows from there. In case of /proc/cmdline, the length is known precisely. Make variables readonly while I'm at it. P.S.: I tried to scp /proc/cpuinfo today and got empty file but this is separate story. Link: https://lkml.kernel.org/r/YxoywlbM73JJN3r+@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/cmdline.c | 6 +++++- include/linux/init.h | 1 + init/main.c | 7 +++++-- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index fa762c5fbcb2..91fe1597af7b 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -3,6 +3,7 @@ #include #include #include +#include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { @@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v) static int __init proc_cmdline_init(void) { - proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init); diff --git a/include/linux/init.h b/include/linux/init.h index 077d7f93b402..2e96756fe1ff 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -143,6 +143,7 @@ struct file_system_type; extern int do_one_initcall(initcall_t fn); extern char __initdata boot_command_line[]; extern char *saved_command_line; +extern unsigned int saved_command_line_len; extern unsigned int reset_devices; /* used by init/main.c */ diff --git a/init/main.c b/init/main.c index aa21add5f7c5..d213371cc067 100644 --- a/init/main.c +++ b/init/main.c @@ -145,7 +145,8 @@ void (*__initdata late_time_init)(void); /* Untouched command line saved by arch-specific code. */ char __initdata boot_command_line[COMMAND_LINE_SIZE]; /* Untouched saved command line (eg. for /proc) */ -char *saved_command_line; +char *saved_command_line __ro_after_init; +unsigned int saved_command_line_len __ro_after_init; /* Command line for parameter parsing */ static char *static_command_line; /* Untouched extra command line */ @@ -667,6 +668,8 @@ static void __init setup_command_line(char *command_line) strcpy(saved_command_line + len, extra_init_args); } } + + saved_command_line_len = strlen(saved_command_line); } /* @@ -1379,7 +1382,7 @@ static void __init do_initcall_level(int level, char *command_line) static void __init do_initcalls(void) { int level; - size_t len = strlen(saved_command_line) + 1; + size_t len = saved_command_line_len + 1; char *command_line; command_line = kzalloc(len, GFP_KERNEL); -- cgit v1.2.3 From 610a2a3d7d8be3537458a378ec69396a76c385b6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:05 +0900 Subject: nilfs2: fix shift-out-of-bounds/overflow in nilfs_sb2_bad_offset() Patch series "nilfs2: fix UBSAN shift-out-of-bounds warnings on mount time". The first patch fixes a bug reported by syzbot, and the second one fixes the remaining bug of the same kind. Although they are triggered by the same super block data anomaly, I divided it into the above two because the details of the issues and how to fix it are different. Both are required to eliminate the shift-out-of-bounds issues at mount time. This patch (of 2): If the block size exponent information written in an on-disk superblock is corrupted, nilfs_sb2_bad_offset helper function can trigger shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38983 is too large for 64-bit type 'unsigned long long' Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 ubsan_epilogue lib/ubsan.c:151 [inline] __ubsan_handle_shift_out_of_bounds+0x33d/0x3b0 lib/ubsan.c:322 nilfs_sb2_bad_offset fs/nilfs2/the_nilfs.c:449 [inline] nilfs_load_super_block+0xdf5/0xe00 fs/nilfs2/the_nilfs.c:523 init_nilfs+0xb7/0x7d0 fs/nilfs2/the_nilfs.c:577 nilfs_fill_super+0xb1/0x5d0 fs/nilfs2/super.c:1047 nilfs_mount+0x613/0x9b0 fs/nilfs2/super.c:1317 ... In addition, since nilfs_sb2_bad_offset() performs multiplication without considering the upper bound, the computation may overflow if the disk layout parameters are not normal. This fixes these issues by inserting preliminary sanity checks for those parameters and by converting the comparison from one involving multiplication and left bit-shifting to one using division and right bit-shifting. Link: https://lkml.kernel.org/r/20221027044306.42774-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20221027044306.42774-2-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+e91619dd4c11c4960706@syzkaller.appspotmail.com Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 3b4a079c9617..d588816fdf2f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "nilfs.h" #include "segment.h" @@ -443,11 +444,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) return crc == le32_to_cpu(sbp->s_sum); } -static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +/** + * nilfs_sb2_bad_offset - check the location of the second superblock + * @sbp: superblock raw data buffer + * @offset: byte offset of second superblock calculated from device size + * + * nilfs_sb2_bad_offset() checks if the position on the second + * superblock is valid or not based on the filesystem parameters + * stored in @sbp. If @offset points to a location within the segment + * area, or if the parameters themselves are not normal, it is + * determined to be invalid. + * + * Return Value: true if invalid, false if valid. + */ +static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) { - return offset < ((le64_to_cpu(sbp->s_nsegments) * - le32_to_cpu(sbp->s_blocks_per_segment)) << - (le32_to_cpu(sbp->s_log_block_size) + 10)); + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + u64 nsegments = le64_to_cpu(sbp->s_nsegments); + u64 index; + + if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS || + shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS) + return true; + + index = offset >> (shift_bits + BLOCK_SIZE_BITS); + do_div(index, blocks_per_segment); + return index < nsegments; } static void nilfs_release_super_block(struct the_nilfs *nilfs) -- cgit v1.2.3 From ebeccaaef67a4895d2496ab8d9c2fb8d89201211 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Oct 2022 13:43:06 +0900 Subject: nilfs2: fix shift-out-of-bounds due to too large exponent of block size If field s_log_block_size of superblock data is corrupted and too large, init_nilfs() and load_nilfs() still can trigger a shift-out-of-bounds warning followed by a kernel panic (if panic_on_warn is set): shift exponent 38973 is too large for 32-bit type 'int' Call Trace: dump_stack_lvl+0xcd/0x134 ubsan_epilogue+0xb/0x50 __ubsan_handle_shift_out_of_bounds.cold.12+0x17b/0x1f5 init_nilfs.cold.11+0x18/0x1d [nilfs2] nilfs_mount+0x9b5/0x12b0 [nilfs2] ... This fixes the issue by adding and using a new helper function for getting block size with sanity check. Link: https://lkml.kernel.org/r/20221027044306.42774-3-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d588816fdf2f..20ff02b4ef5d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -193,6 +193,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_get_blocksize - get block size from raw superblock data + * @sb: super block instance + * @sbp: superblock raw data buffer + * @blocksize: place to store block size + * + * nilfs_get_blocksize() calculates the block size from the block size + * exponent information written in @sbp and stores it in @blocksize, + * or aborts with an error message if it's too large. + * + * Return Value: On success, 0 is returned. If the block size is too + * large, -EINVAL is returned. + */ +static int nilfs_get_blocksize(struct super_block *sb, + struct nilfs_super_block *sbp, int *blocksize) +{ + unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size); + + if (unlikely(shift_bits > + ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) { + nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB", + shift_bits); + return -EINVAL; + } + *blocksize = BLOCK_SIZE << shift_bits; + return 0; +} + /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released @@ -246,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); /* verify consistency between two super blocks */ - blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + err = nilfs_get_blocksize(sb, sbp[0], &blocksize); + if (err) + goto scan_error; + if (blocksize != nilfs->ns_blocksize) { nilfs_warn(sb, "blocksize differs between two super blocks (%d != %d)", blocksize, nilfs->ns_blocksize); + err = -EINVAL; goto scan_error; } @@ -609,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; - blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); - if (blocksize < NILFS_MIN_BLOCK_SIZE || - blocksize > NILFS_MAX_BLOCK_SIZE) { + err = nilfs_get_blocksize(sb, sbp, &blocksize); + if (err) + goto failed_sbh; + + if (blocksize < NILFS_MIN_BLOCK_SIZE) { nilfs_err(sb, "couldn't mount because of unsupported filesystem blocksize %d", blocksize); -- cgit v1.2.3 From 80f784098ff44e086f68f0e8c98b6c6da8702ec4 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:29 +0800 Subject: squashfs: add the mount parameter theads= Patch series 'squashfs: Add the mount parameter "threads="'. Currently, Squashfs supports multiple decompressor parallel modes. However, this mode can be configured only during kernel building and does not support flexible selection during runtime. In the current patch set, the mount parameter "threads=" is added to allow users to select the parallel decompressor mode and configure the number of decompressors when mounting a file system. "threads=" The upper limit is num_online_cpus() * 2. This patch (of 2): Squashfs supports three decompression concurrency modes: Single-thread mode: concurrent reads are blocked and the memory overhead is small. Multi-thread mode/percpu mode: reduces concurrent read blocking but increases memory overhead. The corresponding schema must be fixed at compile time. During mounting, the concurrent decompression mode cannot be adjusted based on file read blocking. The mount parameter theads= is added to select the concurrent decompression mode of a single SquashFS file system image. Link: https://lkml.kernel.org/r/20221019030930.130456-1-nixiaoming@huawei.com Link: https://lkml.kernel.org/r/20221019030930.130456-2-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 39 ++++++++++++++++++++--- fs/squashfs/block.c | 2 +- fs/squashfs/decompressor.c | 2 +- fs/squashfs/decompressor_multi.c | 16 +++++++--- fs/squashfs/decompressor_multi_percpu.c | 23 +++++++++----- fs/squashfs/decompressor_single.c | 15 ++++++--- fs/squashfs/squashfs.h | 23 +++++++++++--- fs/squashfs/squashfs_fs_sb.h | 3 +- fs/squashfs/super.c | 56 +++++++++++++++++++++++++++++++-- 9 files changed, 147 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 916e78fabcaa..218bacdd4298 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -54,9 +54,36 @@ config SQUASHFS_FILE_DIRECT endchoice +config SQUASHFS_DECOMP_SINGLE + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI + depends on SQUASHFS + def_bool n + +config SQUASHFS_DECOMP_MULTI_PERCPU + depends on SQUASHFS + def_bool n + +config SQUASHFS_CHOICE_DECOMP_BY_MOUNT + bool "Select the parallel decompression mode during mount" + depends on SQUASHFS + default n + select SQUASHFS_DECOMP_SINGLE + select SQUASHFS_DECOMP_MULTI + select SQUASHFS_DECOMP_MULTI_PERCPU + help + Compile all parallel decompression modes and specify the + decompression mode by setting "threads=" during mount. + threads= + + default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE + choice - prompt "Decompressor parallelisation options" + prompt "Select decompression parallel mode at compile time" depends on SQUASHFS + depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT help Squashfs now supports three parallelisation options for decompression. Each one exhibits various trade-offs between @@ -64,15 +91,17 @@ choice If in doubt, select "Single threaded compression" -config SQUASHFS_DECOMP_SINGLE +config SQUASHFS_COMPILE_DECOMP_SINGLE bool "Single threaded compression" + select SQUASHFS_DECOMP_SINGLE help Traditionally Squashfs has used single-threaded decompression. Only one block (data or metadata) can be decompressed at any one time. This limits CPU and memory usage to a minimum. -config SQUASHFS_DECOMP_MULTI +config SQUASHFS_COMPILE_DECOMP_MULTI bool "Use multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -85,8 +114,9 @@ config SQUASHFS_DECOMP_MULTI decompressors per core. It dynamically allocates decompressors on a demand basis. -config SQUASHFS_DECOMP_MULTI_PERCPU +config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU bool "Use percpu multiple decompressors for parallel I/O" + select SQUASHFS_DECOMP_MULTI_PERCPU help By default Squashfs uses a single decompressor but it gives poor performance on parallel I/O workloads when using multiple CPU @@ -95,7 +125,6 @@ config SQUASHFS_DECOMP_MULTI_PERCPU This decompressor implementation uses a maximum of one decompressor per core. It uses percpu variables to ensure decompression is load-balanced across the cores. - endchoice config SQUASHFS_XATTR diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 833aca92301f..bed3bb8b27fa 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - res = squashfs_decompress(msblk, bio, offset, length, output); + res = msblk->thread_ops->decompress(msblk, bio, offset, length, output); } else { res = copy_bio_to_actor(bio, output, offset, length); } diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d57bef91ab08..8893cb9b4198 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags) if (IS_ERR(comp_opts)) return comp_opts; - stream = squashfs_decompressor_create(msblk, comp_opts); + stream = msblk->thread_ops->create(msblk, comp_opts); if (IS_ERR(stream)) kfree(comp_opts); diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index db9f12a3ea05..eb25432bd149 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -29,12 +29,11 @@ #define MAX_DECOMPRESSOR (num_online_cpus() * 2) -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return MAX_DECOMPRESSOR; } - struct squashfs_stream { void *comp_opts; struct list_head strm_list; @@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm, wake_up(&stream->wait); } -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -103,7 +102,7 @@ out: } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { @@ -180,7 +179,7 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, msblk->decompressor->name); return res; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index b881b9283b7f..1dfadf76ed9a 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -25,7 +25,7 @@ struct squashfs_stream { local_lock_t lock; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -59,7 +59,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream __percpu *percpu = (struct squashfs_stream __percpu *) msblk->stream; @@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct squashfs_stream *stream; + struct squashfs_stream __percpu *percpu = + (struct squashfs_stream __percpu *) msblk->stream; int res; - local_lock(&msblk->stream->lock); - stream = this_cpu_ptr(msblk->stream); + local_lock(&percpu->lock); + stream = this_cpu_ptr(percpu); res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); - local_unlock(&msblk->stream->lock); + local_unlock(&percpu->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", @@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return num_possible_cpus(); } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 4eb3d083d45e..6f161887710b 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -24,7 +24,7 @@ struct squashfs_stream { struct mutex mutex; }; -void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, +static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; @@ -49,7 +49,7 @@ out: return ERR_PTR(err); } -void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) +static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; @@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, +static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { @@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, return res; } -int squashfs_max_decompressors(void) +static int squashfs_max_decompressors(void) { return 1; } + +const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = { + .create = squashfs_decompressor_create, + .destroy = squashfs_decompressor_destroy, + .decompress = squashfs_decompress, + .max_decompressors = squashfs_max_decompressors, +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9783e01c8100..a6164fdf9435 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ -extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); -extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, - int, int, struct squashfs_page_actor *); -extern int squashfs_max_decompressors(void); + +struct squashfs_decompressor_thread_ops { + void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts); + void (*destroy)(struct squashfs_sb_info *msblk); + int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output); + int (*max_decompressors)(void); +}; + +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi; +#endif +#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU +extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu; +#endif /* export.c */ extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 1e90c2575f9b..f1e5dad8ae0a 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -53,7 +53,7 @@ struct squashfs_sb_info { __le64 *xattr_id_table; struct mutex meta_index_mutex; struct meta_index *meta_index; - struct squashfs_stream *stream; + void *stream; __le64 *inode_lookup_table; u64 inode_table; u64 directory_table; @@ -66,5 +66,6 @@ struct squashfs_sb_info { int xattr_ids; unsigned int ids; bool panic_on_errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 32565dafa7f3..aac3ea72a9ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -47,10 +47,12 @@ enum Opt_errors { enum squashfs_param { Opt_errors, + Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; + const struct squashfs_decompressor_thread_ops *thread_ops; }; static const struct constant_table squashfs_param_errors[] = { @@ -61,9 +63,29 @@ static const struct constant_table squashfs_param_errors[] = { static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), + fsparam_string("threads", Opt_threads), {} }; +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (strcmp(str, "single") == 0) { + opts->thread_ops = &squashfs_decompressor_single; + return 0; + } + if (strcmp(str, "multi") == 0) { + opts->thread_ops = &squashfs_decompressor_multi; + return 0; + } + if (strcmp(str, "percpu") == 0) { + opts->thread_ops = &squashfs_decompressor_percpu; + return 0; + } +#endif + return -EINVAL; +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -78,6 +100,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para case Opt_errors: opts->errors = result.uint_32; break; + case Opt_threads: + if (squashfs_parse_param_threads(param->string, opts) != 0) + return -EINVAL; + break; default: return -EINVAL; } @@ -167,6 +193,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } + msblk->thread_ops = opts->thread_ops; /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -252,7 +279,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - squashfs_max_decompressors(), msblk->block_size); + msblk->thread_ops->max_decompressors(), msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -383,7 +410,7 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); - squashfs_decompressor_destroy(msblk); + msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); @@ -435,6 +462,20 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) else seq_puts(s, ",errors=continue"); +#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT + if (msblk->thread_ops == &squashfs_decompressor_single) { + seq_puts(s, ",threads=single"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_multi) { + seq_puts(s, ",threads=multi"); + return 0; + } + if (msblk->thread_ops == &squashfs_decompressor_percpu) { + seq_puts(s, ",threads=percpu"); + return 0; + } +#endif return 0; } @@ -446,6 +487,15 @@ static int squashfs_init_fs_context(struct fs_context *fc) if (!opts) return -ENOMEM; +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + opts->thread_ops = &squashfs_decompressor_single; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) + opts->thread_ops = &squashfs_decompressor_multi; +#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) + opts->thread_ops = &squashfs_decompressor_percpu; +#else +#error "fail: unknown squashfs decompression thread mode?" +#endif fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; @@ -478,7 +528,7 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); - squashfs_decompressor_destroy(sbi); + sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); -- cgit v1.2.3 From fb40fe04f9df23114782d5edd1c5d017ae9d0ca8 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Wed, 19 Oct 2022 11:09:30 +0800 Subject: squashfs: allows users to configure the number of decompression threads The maximum number of threads in the decompressor_multi.c file is fixed and cannot be adjusted according to user needs. Therefore, the mount parameter needs to be added to allow users to configure the number of threads as required. The upper limit is num_online_cpus() * 2. Link: https://lkml.kernel.org/r/20221019030930.130456-3-nixiaoming@huawei.com Signed-off-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Jianguo Chen Cc: Jubin Zhong Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 16 ++++++++++-- fs/squashfs/decompressor_multi.c | 4 +-- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/super.c | 55 +++++++++++++++++++++++++++++++++++----- 4 files changed, 66 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 218bacdd4298..60fc98bdf421 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -73,11 +73,10 @@ config SQUASHFS_CHOICE_DECOMP_BY_MOUNT select SQUASHFS_DECOMP_SINGLE select SQUASHFS_DECOMP_MULTI select SQUASHFS_DECOMP_MULTI_PERCPU + select SQUASHFS_MOUNT_DECOMP_THREADS help Compile all parallel decompression modes and specify the decompression mode by setting "threads=" during mount. - threads= - default Decompressor parallelisation is SQUASHFS_DECOMP_SINGLE choice @@ -127,6 +126,19 @@ config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU decompression is load-balanced across the cores. endchoice +config SQUASHFS_MOUNT_DECOMP_THREADS + bool "Add the mount parameter 'threads=' for squashfs" + depends on SQUASHFS + depends on SQUASHFS_DECOMP_MULTI + default n + help + Use threads= to set the decompression parallel mode and the number of threads. + If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y + threads= + else + threads=<2|3|...> + The upper limit is num_online_cpus() * 2. + config SQUASHFS_XATTR bool "Squashfs XATTR support" depends on SQUASHFS diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index eb25432bd149..416c53eedbd1 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -144,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, * If there is no available decomp and already full, * let's wait for releasing decomp from other users. */ - if (stream->avail_decomp >= MAX_DECOMPRESSOR) + if (stream->avail_decomp >= msblk->max_thread_num) goto wait; /* Let's allocate new decomp */ @@ -160,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk, } stream->avail_decomp++; - WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR); + WARN_ON(stream->avail_decomp > msblk->max_thread_num); mutex_unlock(&stream->mutex); break; diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index f1e5dad8ae0a..659082e9e51d 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -67,5 +67,6 @@ struct squashfs_sb_info { unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int max_thread_num; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index aac3ea72a9ba..1e428ca9414e 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -53,6 +53,7 @@ enum squashfs_param { struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; + int thread_num; }; static const struct constant_table squashfs_param_errors[] = { @@ -67,7 +68,8 @@ static const struct fs_parameter_spec squashfs_fs_parameters[] = { {} }; -static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) + +static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { @@ -86,6 +88,42 @@ static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_o return -EINVAL; } +static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) +{ +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + int ret; + unsigned long num; + + ret = kstrtoul(str, 0, &num); + if (ret != 0) + return -EINVAL; + if (num > 1) { + opts->thread_ops = &squashfs_decompressor_multi; + if (num > opts->thread_ops->max_decompressors()) + return -EINVAL; + opts->thread_num = (int)num; + return 0; + } +#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE + if (num == 1) { + opts->thread_ops = &squashfs_decompressor_single; + opts->thread_num = 1; + return 0; + } +#endif +#endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ + return -EINVAL; +} + +static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) +{ + int ret = squashfs_parse_param_threads_str(str, opts); + + if (ret == 0) + return ret; + return squashfs_parse_param_threads_num(str, opts); +} + static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; @@ -194,6 +232,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { + msblk->max_thread_num = msblk->thread_ops->max_decompressors(); + } else { + msblk->max_thread_num = opts->thread_num; + } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( @@ -279,7 +322,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", - msblk->thread_ops->max_decompressors(), msblk->block_size); + msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; @@ -467,14 +510,13 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",threads=single"); return 0; } - if (msblk->thread_ops == &squashfs_decompressor_multi) { - seq_puts(s, ",threads=multi"); - return 0; - } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } +#endif +#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS + seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } @@ -496,6 +538,7 @@ static int squashfs_init_fs_context(struct fs_context *fc) #else #error "fail: unknown squashfs decompression thread mode?" #endif + opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; -- cgit v1.2.3 From 13b6269dd022aaa69ca8d1df374ab327504121cf Mon Sep 17 00:00:00 2001 From: Shang XiaoJing Date: Tue, 1 Nov 2022 19:15:33 +0800 Subject: ocfs2: fix memory leak in ocfs2_stack_glue_init() ocfs2_table_header should be free in ocfs2_stack_glue_init() if ocfs2_sysfs_init() failed, otherwise kmemleak will report memleak. BUG: memory leak unreferenced object 0xffff88810eeb5800 (size 128): comm "modprobe", pid 4507, jiffies 4296182506 (age 55.888s) hex dump (first 32 bytes): c0 40 14 a0 ff ff ff ff 00 00 00 00 01 00 00 00 .@.............. 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<000000001e59e1cd>] __register_sysctl_table+0xca/0xef0 [<00000000c04f70f7>] 0xffffffffa0050037 [<000000001bd12912>] do_one_initcall+0xdb/0x480 [<0000000064f766c9>] do_init_module+0x1cf/0x680 [<000000002ba52db0>] load_module+0x6441/0x6f20 [<000000009772580d>] __do_sys_finit_module+0x12f/0x1c0 [<00000000380c1f22>] do_syscall_64+0x3f/0x90 [<000000004cf473bc>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://lkml.kernel.org/r/41651ca1-432a-db34-eb97-d35744559de1@linux.alibaba.com Fixes: 3878f110f71a ("ocfs2: Move the hb_ctl_path sysctl into the stack glue.") Signed-off-by: Shang XiaoJing Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 317126261523..a8d5ca98fa57 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header; static int __init ocfs2_stack_glue_init(void) { + int ret; + strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); @@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void) return -ENOMEM; /* or something. */ } - return ocfs2_sysfs_init(); + ret = ocfs2_sysfs_init(); + if (ret) + unregister_sysctl_table(ocfs2_table_header); + + return ret; } static void __exit ocfs2_stack_glue_exit(void) -- cgit v1.2.3 From c7e8d3279c984e41165a7b510759bd1771ac3941 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 1 Nov 2022 15:33:43 +0800 Subject: squashfs: fix null-ptr-deref in squashfs_fill_super When squashfs_read_table() returns an error or `sb->s_magic != SQUASHFS_MAGIC`, enters the error branch and calls msblk->thread_ops->destroy(msblk) to destroy msblk. However, msblk->thread_ops has not been initialized. Therefore, the following problem is triggered: ================================================================== BUG: KASAN: null-ptr-deref in squashfs_fill_super+0xe7a/0x13b0 Read of size 8 at addr 0000000000000008 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc3-next-20221031 #367 Call Trace: dump_stack_lvl+0x73/0x9f print_report+0x743/0x759 kasan_report+0xc0/0x120 __asan_load8+0xd3/0x140 squashfs_fill_super+0xe7a/0x13b0 get_tree_bdev+0x27b/0x450 squashfs_get_tree+0x19/0x30 vfs_get_tree+0x49/0x150 path_mount+0xaae/0x1350 init_mount+0xad/0x100 do_mount_root+0xbc/0x1d0 mount_block_root+0x173/0x316 mount_root+0x223/0x236 prepare_namespace+0x1eb/0x237 kernel_init_freeable+0x528/0x576 kernel_init+0x29/0x250 ret_from_fork+0x1f/0x30 ================================================================== To solve this issue, msblk->thread_ops is initialized immediately after msblk is assigned a value. Link: https://lkml.kernel.org/r/20221101073343.3961562-1-libaokun1@huawei.com Fixes: b0645770d3c7 ("squashfs: add the mount parameter theads=") Signed-off-by: Baokun Li Reviewed-by: Xiaoming Ni Reviewed-by: Phillip Lougher Cc: Yu Kuai Cc: Zhang Yi Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 1e428ca9414e..7d5265a39d20 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -197,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; } msblk = sb->s_fs_info; + msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); @@ -231,7 +232,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_bdev); goto failed_mount; } - msblk->thread_ops = opts->thread_ops; + if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { -- cgit v1.2.3 From 2e41f274f9aa71cdcc69dc1f26a3f9304a651804 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/libfs.c | 22 +++++++++++++++++++--- include/linux/fs.h | 12 ++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1c..aada4e7c8713 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ out: EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ out: mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on diff --git a/include/linux/fs.h b/include/linux/fs.h index e654435f1651..452700c5fa1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3485,7 +3485,7 @@ void simple_transaction_set(struct file *file, size_t n); * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ -#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -3496,10 +3496,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ - .write = simple_attr_write, \ + .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } +#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { @@ -3514,6 +3520,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); -- cgit v1.2.3 From d472cf797c4e268613dbce5ec9b95d0bcae19ecb Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- Documentation/fault-injection/fault-injection.rst | 10 ++++---- fs/debugfs/file.c | 28 ++++++++++++++++++----- include/linux/debugfs.h | 19 +++++++++++++-- 3 files changed, 43 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 17779a2772e5..5f6454b9dbd4 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -83,9 +83,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/times: specifies how many times failures may happen at most. A value of -1 - means "no limit". Note, though, that this file only accepts unsigned - values. So, if you want to specify -1, you better use 'printf' instead - of 'echo', e.g.: $ printf %#x -1 > times + means "no limit". - /sys/kernel/debug/fail*/space: @@ -284,7 +282,7 @@ Application Examples echo Y > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -338,7 +336,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -369,7 +367,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 100 > /sys/kernel/debug/$FAILTYPE/probability echo 0 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 1 > /sys/kernel/debug/$FAILTYPE/verbose diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index ddb3fc258df9..b54f470e0d03 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index f60674692d36..ea2d919fd9c7 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,7 @@ struct debugfs_u32_array { extern struct dentry *arch_debugfs_dir; -#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -56,10 +56,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ - .write = debugfs_attr_write, \ + .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \ .llseek = no_llseek, \ } +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); #if defined(CONFIG_DEBUG_FS) @@ -102,6 +108,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -254,6 +262,13 @@ static inline ssize_t debugfs_attr_write(struct file *file, return -ENODEV; } +static inline ssize_t debugfs_attr_write_signed(struct file *file, + const char __user *buf, + size_t len, loff_t *ppos) +{ + return -ENODEV; +} + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { -- cgit v1.2.3 From ce2fcf1516d674a174d9b34d1e1024d64de9fba3 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Wed, 9 Nov 2022 15:46:27 +0800 Subject: ocfs2: fix memory leak in ocfs2_mount_volume() There is a memory leak reported by kmemleak: unreferenced object 0xffff88810cc65e60 (size 32): comm "mount.ocfs2", pid 23753, jiffies 4302528942 (age 34735.105s) hex dump (first 32 bytes): 10 00 00 00 00 00 00 00 00 01 01 01 01 01 01 01 ................ 01 01 01 01 01 01 01 01 00 00 00 00 00 00 00 00 ................ backtrace: [] __kmalloc+0x4d/0x150 [] ocfs2_compute_replay_slots+0x121/0x330 [ocfs2] [] ocfs2_check_volume+0x485/0x900 [ocfs2] [] ocfs2_mount_volume.isra.0+0x1e9/0x650 [ocfs2] [] ocfs2_fill_super+0xe0b/0x1740 [ocfs2] [] mount_bdev+0x312/0x400 [] legacy_get_tree+0xed/0x1d0 [] vfs_get_tree+0x7d/0x230 [] path_mount+0xd62/0x1760 [] do_mount+0xca/0xe0 [] __x64_sys_mount+0x12c/0x1a0 [] do_syscall_64+0x35/0x80 [] entry_SYSCALL_64_after_hwframe+0x46/0xb0 This call stack is related to two problems. Firstly, the ocfs2 super uses "replay_map" to trace online/offline slots, in order to recover offline slots during recovery and mount. But when ocfs2_truncate_log_init() returns an error in ocfs2_mount_volume(), the memory of "replay_map" will not be freed in error handling path. Secondly, the memory of "replay_map" will not be freed if d_make_root() returns an error in ocfs2_fill_super(). But the memory of "replay_map" will be freed normally when completing recovery and mount in ocfs2_complete_mount_recovery(). Fix the first problem by adding error handling path to free "replay_map" when ocfs2_truncate_log_init() fails. And fix the second problem by calling ocfs2_free_replay_slots(osb) in the error handling path "out_dismount". In addition, since ocfs2_free_replay_slots() is static, it is necessary to remove its static attribute and declare it in header file. Link: https://lkml.kernel.org/r/20221109074627.2303950-1-lizetao1@huawei.com Fixes: 9140db04ef18 ("ocfs2: recover orphans in offline slots during recovery and mount") Signed-off-by: Li Zetao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/journal.h | 1 + fs/ocfs2/super.c | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 126671e6caed..3fb98b4569a2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -static void ocfs2_free_replay_slots(struct ocfs2_super *osb) +void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 969d0aa28718..41c382f68529 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); +void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 42c993e53924..0b0e6a132101 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) out_dismount: atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); + ocfs2_free_replay_slots(osb); ocfs2_dismount_volume(sb, 1); goto out; @@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_truncate_log_init(osb); if (status < 0) { mlog_errno(status); - goto out_system_inodes; + goto out_check_volume; } ocfs2_super_unlock(osb, 1); return 0; +out_check_volume: + ocfs2_free_replay_slots(osb); out_system_inodes: if (osb->local_alloc_state == OCFS2_LA_ENABLED) ocfs2_shutdown_local_alloc(osb); -- cgit v1.2.3 From 811b99fd237e17230d660c6000021f23ce3f3985 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Fri, 11 Nov 2022 02:56:48 -0500 Subject: fat (exportfs): fix some kernel-doc warnings Fix the following W=1 kernel build warning(s): fs/fat/nfs.c:21: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst fs/fat/nfs.c:139: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Link: https://lkml.kernel.org/r/20221111075648.4005-1-liubo03@inspur.com Signed-off-by: Bo Liu Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/nfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index af191371c352..3626eb585a98 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -17,7 +17,7 @@ struct fat_fid { #define FAT_FID_SIZE_WITHOUT_PARENT 3 #define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32)) -/** +/* * Look up a directory inode given its starting cluster. */ static struct inode *fat_dget(struct super_block *sb, int i_logstart) @@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp, return type; } -/** +/* * Map a NFS file handle to a corresponding dentry. * The dentry may or may not be connected to the filesystem root. */ -- cgit v1.2.3 From c9a934c7d88413a35861387a11e901554810b122 Mon Sep 17 00:00:00 2001 From: Alexey Asemov Date: Sun, 27 Nov 2022 09:46:38 +0300 Subject: ocfs2: always read both high and low parts of dinode link count When filesystem is using indexed-dirs feature, maximum link count values can spill over to i_links_count_hi, up to OCFS2_DX_LINK_MAX links. ocfs2_read_links_count() checks for OCFS2_INDEXED_DIR_FL flag in dinode, but this flag is only valid for directories so for files the check causes high part of the link count not being read back from file dinodes resulting in wrong link count value when file has >65535 links. As ocfs2_set_links_count() always writes both high and low parts of link count, the flag check on reading may be removed. Link: https://lkml.kernel.org/r/cbfca02b-b39f-89de-e1a8-904a6c60407e@alex-at.net Signed-off-by: Alexey Asemov Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740b64238312..a503c553bab2 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di) u32 nlink = le16_to_cpu(di->i_links_count); u32 hi = le16_to_cpu(di->i_links_count_hi); - if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL)) - nlink |= (hi << OCFS2_LINKS_HI_SHIFT); + nlink |= (hi << OCFS2_LINKS_HI_SHIFT); return nlink; } -- cgit v1.2.3 From 8d824e69d9f3fa3121b2dda25053bae71e2460d2 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 30 Nov 2022 06:59:59 +0000 Subject: hfs: fix OOB Read in __hfs_brec_find Syzbot reported a OOB read bug: ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 Read of size 1 at addr ffff88807eb62c4e by task kworker/u4:1/11 CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc6-syzkaller-00308-g644e9524388a #0 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_strcmp+0x117/0x190 fs/hfs/string.c:84 __hfs_brec_find+0x213/0x5c0 fs/hfs/bfind.c:75 hfs_brec_find+0x276/0x520 fs/hfs/bfind.c:138 hfs_write_inode+0x34c/0xb40 fs/hfs/inode.c:462 write_inode fs/fs-writeback.c:1440 [inline] If the input inode of hfs_write_inode() is incorrect: struct inode struct hfs_inode_info struct hfs_cat_key struct hfs_name u8 len # len is greater than HFS_NAMELEN(31) which is the maximum length of an HFS filename OOB read occurred: hfs_write_inode() hfs_brec_find() __hfs_brec_find() hfs_cat_keycmp() hfs_strcmp() # OOB read occurred due to len is too large Fix this by adding a Check on len in hfs_write_inode() before calling hfs_brec_find(). Link: https://lkml.kernel.org/r/20221130065959.2168236-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reported-by: Cc: Damien Le Moal Cc: Ira Weiny Cc: Jeff Layton Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nanyong Sun Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..a0746be3c1de 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -458,6 +458,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) /* panic? */ return -EIO; + if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) + return -EIO; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) /* panic? */ -- cgit v1.2.3 From c53ed55cb275344086e32a7080a6b19cb183650b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 2 Dec 2022 03:00:38 +0000 Subject: hfs: Fix OOB Write in hfs_asc2mac Syzbot reported a OOB Write bug: loop0: detected capacity change from 0 to 64 ================================================================== BUG: KASAN: slab-out-of-bounds in hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 Write of size 1 at addr ffff88801848314e by task syz-executor391/3632 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1b1/0x28e lib/dump_stack.c:106 print_address_description+0x74/0x340 mm/kasan/report.c:284 print_report+0x107/0x1f0 mm/kasan/report.c:395 kasan_report+0xcd/0x100 mm/kasan/report.c:495 hfs_asc2mac+0x467/0x9a0 fs/hfs/trans.c:133 hfs_cat_build_key+0x92/0x170 fs/hfs/catalog.c:28 hfs_lookup+0x1ab/0x2c0 fs/hfs/dir.c:31 lookup_open fs/namei.c:3391 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x10e6/0x2df0 fs/namei.c:3710 do_filp_open+0x264/0x4f0 fs/namei.c:3740 If in->len is much larger than HFS_NAMELEN(31) which is the maximum length of an HFS filename, a OOB write could occur in hfs_asc2mac(). In that case, when the dst reaches the boundary, the srclen is still greater than 0, which causes a OOB write. Fix this by adding a check on dstlen in while() before writing to dst address. Link: https://lkml.kernel.org/r/20221202030038.1391945-1-zhangpeng362@huawei.com Fixes: 328b92278650 ("[PATCH] hfs: NLS support") Signed-off-by: ZhangPeng Reviewed-by: Viacheslav Dubeyko Reported-by: Signed-off-by: Andrew Morton --- fs/hfs/trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index 39f5e343bf4d..fdb0edb8a607 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr if (nls_io) { wchar_t ch; - while (srclen > 0) { + while (srclen > 0 && dstlen > 0) { size = nls_io->char2uni(src, srclen, &ch); if (size < 0) { ch = '?'; -- cgit v1.2.3 From 9f2b5debc07073e6dfdd774e3594d0224b991927 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 Dec 2022 03:05:40 +0000 Subject: hfsplus: fix bug causing custom uid and gid being unable to be assigned with mount Despite specifying UID and GID in mount command, the specified UID and GID were not being assigned. This patch fixes this issue. Link: https://lkml.kernel.org/r/C0264BF5-059C-45CF-B8DA-3A3BD2C803A2@live.com Signed-off-by: Aditya Garg Reviewed-by: Viacheslav Dubeyko Cc: Signed-off-by: Andrew Morton --- fs/hfsplus/hfsplus_fs.h | 2 ++ fs/hfsplus/inode.c | 4 ++-- fs/hfsplus/options.c | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a5db2e3b2980..6aa919e59483 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -198,6 +198,8 @@ struct hfsplus_sb_info { #define HFSPLUS_SB_HFSX 3 #define HFSPLUS_SB_CASEFOLD 4 #define HFSPLUS_SB_NOBARRIER 5 +#define HFSPLUS_SB_UID 6 +#define HFSPLUS_SB_GID 7 static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..b675581aa9d0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode, mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); - if (!i_uid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); - if (!i_gid_read(inode) && !mode) + if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 047e05c57560..c94a58762ad6 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!uid_valid(sbi->uid)) { pr_err("invalid uid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_UID, &sbi->flags); } break; case opt_gid: @@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) if (!gid_valid(sbi->gid)) { pr_err("invalid gid specified\n"); return 0; + } else { + set_bit(HFSPLUS_SB_GID, &sbi->flags); } break; case opt_part: -- cgit v1.2.3