From eb02d39ad3099c3dcd96c5b3fdc0303541766ed1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:31:16 -0700 Subject: netdevice.h: fix proto_down_reason kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'proto_down_reason' not described in 'net_device' Fixes: 829eb208e80d ("rtnetlink: add support for protodown reason") Signed-off-by: Randy Dunlap Acked-by: Roopa Prabhu Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..bf0e313486be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,6 +1784,7 @@ enum netdev_priv_flags { * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * + * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one -- cgit v1.2.3 From ffa59b0b396cb2c0ea6712ff30ee05c35d4c9c8d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:32:30 -0700 Subject: netdevice.h: fix xdp_state kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'xdp_state' not described in 'net_device' Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Randy Dunlap Cc: Andrii Nakryiko Cc: Alexei Starovoitov Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf0e313486be..7bd4fcdd0738 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1849,6 +1849,7 @@ enum netdev_priv_flags { * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state + * @xdp_state: stores info on attached XDP BPF programs * * FIXME: cleanup struct net_device such that network protocol info * moves out. -- cgit v1.2.3 From 67cc570edaa02016a8685a06a0ee91f05a6277d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Aug 2020 19:28:42 +0200 Subject: netfilter: nf_tables: coalesce multiple notifications into one skbuff On x86_64, each notification results in one skbuff allocation which consumes at least 768 bytes due to the skbuff overhead. This patch coalesces several notifications into one single skbuff, so each notification consumes at least ~211 bytes, that ~3.5 times less memory consumption. As a result, this is reducing the chances to exhaust the netlink socket receive buffer. Rule of thumb is that each notification batch only contains netlink messages whose report flag is the same, nfnetlink_send() requires this to do appropriate delivery to userspace, either via unicast (echo mode) or multicast (monitor mode). The skbuff control buffer is used to annotate the report flag for later handling at the new coalescing routine. The batch skbuff notification size is NLMSG_GOODSIZE, using a larger skbuff would allow for more socket receiver buffer savings (to amortize the cost of the skbuff even more), however, going over that size might break userspace applications, so let's be conservative and stick to NLMSG_GOODSIZE. Reported-by: Phil Sutter Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 70 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index a1a8d45adb42..6c0806bd8d1e 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -8,6 +8,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; struct list_head module_list; + struct list_head notify_list; struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b7dc1cbf40ea..4603b667973a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -684,6 +684,18 @@ nla_put_failure: return -1; } +struct nftnl_skb_parms { + bool report; +}; +#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb)) + +static void nft_notify_enqueue(struct sk_buff *skb, bool report, + struct list_head *notify_list) +{ + NFT_CB(skb).report = report; + list_add_tail(&skb->list, notify_list); +} + static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct sk_buff *skb; @@ -715,8 +727,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -1468,8 +1479,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -2807,8 +2817,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -3837,8 +3846,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, - gfp_flags); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -4959,8 +4967,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, - GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -6275,7 +6282,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, goto err; } - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); + nft_notify_enqueue(skb, report, &net->nft.notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -7085,8 +7092,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, goto err; } - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, - ctx->report, GFP_KERNEL); + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -7695,6 +7701,41 @@ static void nf_tables_commit_release(struct net *net) mutex_unlock(&net->nft.commit_mutex); } +static void nft_commit_notify(struct net *net, u32 portid) +{ + struct sk_buff *batch_skb = NULL, *nskb, *skb; + unsigned char *data; + int len; + + list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) { + if (!batch_skb) { +new_batch: + batch_skb = skb; + len = NLMSG_GOODSIZE - skb->len; + list_del(&skb->list); + continue; + } + len -= skb->len; + if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) { + data = skb_put(batch_skb, skb->len); + memcpy(data, skb->data, skb->len); + list_del(&skb->list); + kfree_skb(skb); + continue; + } + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, + NFT_CB(batch_skb).report, GFP_KERNEL); + goto new_batch; + } + + if (batch_skb) { + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, + NFT_CB(batch_skb).report, GFP_KERNEL); + } + + WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; @@ -7897,6 +7938,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } + nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_release(net); @@ -8721,6 +8763,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); INIT_LIST_HEAD(&net->nft.module_list); + INIT_LIST_HEAD(&net->nft.notify_list); mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -8737,6 +8780,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); WARN_ON_ONCE(!list_empty(&net->nft.module_list)); + WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); } static struct pernet_operations nf_tables_net_ops = { -- cgit v1.2.3 From 4a009cb04aeca0de60b73f37b102573354214b52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Sep 2020 01:27:40 -0700 Subject: net: add __must_check to skb_put_padto() skb_put_padto() and __skb_put_padto() callers must check return values or risk use-after-free. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ed9bea924dc3..04a18e01b362 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3223,8 +3223,9 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ -static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, - bool free_on_error) +static inline int __must_check __skb_put_padto(struct sk_buff *skb, + unsigned int len, + bool free_on_error) { unsigned int size = skb->len; @@ -3247,7 +3248,7 @@ static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } -- cgit v1.2.3 From 2d2fe8433796603091ac8ea235b9165ac5a85f9a Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 9 Sep 2020 20:43:08 +0300 Subject: net: qed: Disable aRFS for NPAR and 100G In CMT and NPAR the PF is unknown when the GFS block processes the packet. Therefore cannot use searcher as it has a per PF database, and thus ARFS must be disabled. Fixes: d51e4af5c209 ("qed: aRFS infrastructure support") Signed-off-by: Manish Chopra Signed-off-by: Igor Russkikh Signed-off-by: Michal Kalderon Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 11 ++++++++++- drivers/net/ethernet/qlogic/qed/qed_l2.c | 3 +++ drivers/net/ethernet/qlogic/qed/qed_main.c | 2 ++ include/linux/qed/qed_if.h | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index b8f076e4e6b8..3db181f3617a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -4253,7 +4253,8 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) | BIT(QED_MF_LLH_PROTO_CLSS) | BIT(QED_MF_LL2_NON_UNICAST) | - BIT(QED_MF_INTER_PF_SWITCH); + BIT(QED_MF_INTER_PF_SWITCH) | + BIT(QED_MF_DISABLE_ARFS); break; case NVM_CFG1_GLOB_MF_MODE_DEFAULT: cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) | @@ -4266,6 +4267,14 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n", cdev->mf_bits); + + /* In CMT the PF is unknown when the GFS block processes the + * packet. Therefore cannot use searcher as it has a per PF + * database, and thus ARFS must be disabled. + * + */ + if (QED_IS_CMT(cdev)) + cdev->mf_bits |= BIT(QED_MF_DISABLE_ARFS); } DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n", diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 4c6ac8862744..07824bf9d68d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1980,6 +1980,9 @@ void qed_arfs_mode_configure(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_arfs_config_params *p_cfg_params) { + if (test_bit(QED_MF_DISABLE_ARFS, &p_hwfn->cdev->mf_bits)) + return; + if (p_cfg_params->mode != QED_FILTER_CONFIG_MODE_DISABLE) { qed_gft_config(p_hwfn, p_ptt, p_hwfn->rel_pf_id, p_cfg_params->tcp, diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index f39f629242a1..50e5eb22e60a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -444,6 +444,8 @@ int qed_fill_dev_info(struct qed_dev *cdev, dev_info->fw_eng = FW_ENGINEERING_VERSION; dev_info->b_inter_pf_switch = test_bit(QED_MF_INTER_PF_SWITCH, &cdev->mf_bits); + if (!test_bit(QED_MF_DISABLE_ARFS, &cdev->mf_bits)) + dev_info->b_arfs_capable = true; dev_info->tx_switching = true; if (hw_info->b_wol_support == QED_WOL_SUPPORT_PME) diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index cd6a5c7e56eb..cdd73afc4c46 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -623,6 +623,7 @@ struct qed_dev_info { #define QED_MFW_VERSION_3_OFFSET 24 u32 flash_size; + bool b_arfs_capable; bool b_inter_pf_switch; bool tx_switching; bool rdma_supported; -- cgit v1.2.3 From 83896b0bd8223ac33bcc609bcc82a57a587002ff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 10 Sep 2020 04:46:40 -0400 Subject: net: Fix broken NETIF_F_CSUM_MASK spell in netdev_features.h Remove the weird space inside the NETIF_F_CSUM_MASK. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2cc3cf80b49a..0b17c4322b09 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -193,7 +193,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) #define NETIF_F_GSO_MASK (__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \ __NETIF_F_BIT(NETIF_F_GSO_SHIFT)) -/* List of IP checksum features. Note that NETIF_F_ HW_CSUM should not be +/* List of IP checksum features. Note that NETIF_F_HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ -- cgit v1.2.3 From 553d87b658fed0e22a0f86b4f1b093c39d3e3074 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 10 Sep 2020 15:34:39 +0200 Subject: netlink: fix doc about nlmsg_parse/nla_validate There is no @validate argument. CC: Johannes Berg Fixes: 3de644035446 ("netlink: re-add parse/validate functions in strict mode") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/netlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index c0411f14fb53..8e0eb2c9c528 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -726,7 +726,6 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected - * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() @@ -824,7 +823,6 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy - * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the -- cgit v1.2.3 From 129134e5415d46f38b9978b3809af94ed649b57d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 11 Sep 2020 05:07:58 +0200 Subject: media: media/v4l2: remove V4L2_FLAG_MEMORY_NON_CONSISTENT flag The patch partially reverts some of the UAPI bits of the buffer cache management hints. Namely, the queue consistency (memory coherency) user-space hint because, as it turned out, the kernel implementation of this feature was misusing DMA_ATTR_NON_CONSISTENT. The patch reverts both kernel and user space parts: removes the DMA consistency attr functions, rolls back changes to v4l2_requestbuffers, v4l2_create_buffers structures and corresponding UAPI functions (plus compat32 layer) and cleans up the documentation. [hverkuil: fixed a few typos in the commit log] [hverkuil: fixed vb2_core_reqbufs call in drivers/media/dvb-core/dvb_vb2.c] [mchehab: fixed a typo in the commit log: revers->reverts] Signed-off-by: Christoph Hellwig Signed-off-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/buffer.rst | 17 -------- .../userspace-api/media/v4l/vidioc-create-bufs.rst | 6 +-- .../userspace-api/media/v4l/vidioc-reqbufs.rst | 12 +----- drivers/media/common/videobuf2/videobuf2-core.c | 46 +++------------------- .../media/common/videobuf2/videobuf2-dma-contig.c | 19 --------- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 3 +- drivers/media/common/videobuf2/videobuf2-v4l2.c | 18 +-------- drivers/media/dvb-core/dvb_vb2.c | 2 +- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 10 +---- drivers/media/v4l2-core/v4l2-ioctl.c | 5 ++- include/media/videobuf2-core.h | 7 +--- include/uapi/linux/videodev2.h | 13 +----- 12 files changed, 23 insertions(+), 135 deletions(-) (limited to 'include') diff --git a/Documentation/userspace-api/media/v4l/buffer.rst b/Documentation/userspace-api/media/v4l/buffer.rst index 57e752aaf414..2044ed13cd9d 100644 --- a/Documentation/userspace-api/media/v4l/buffer.rst +++ b/Documentation/userspace-api/media/v4l/buffer.rst @@ -701,23 +701,6 @@ Memory Consistency Flags :stub-columns: 0 :widths: 3 1 4 - * .. _`V4L2-FLAG-MEMORY-NON-CONSISTENT`: - - - ``V4L2_FLAG_MEMORY_NON_CONSISTENT`` - - 0x00000001 - - A buffer is allocated either in consistent (it will be automatically - coherent between the CPU and the bus) or non-consistent memory. The - latter can provide performance gains, for instance the CPU cache - sync/flush operations can be avoided if the buffer is accessed by the - corresponding device only and the CPU does not read/write to/from that - buffer. However, this requires extra care from the driver -- it must - guarantee memory consistency by issuing a cache flush/sync when - consistency is needed. If this flag is set V4L2 will attempt to - allocate the buffer in non-consistent memory. The flag takes effect - only if the buffer is used for :ref:`memory mapping ` I/O and the - queue reports the :ref:`V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS - ` capability. - .. c:type:: v4l2_memory enum v4l2_memory diff --git a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst index f2a702870fad..12cf6b44f414 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-create-bufs.rst @@ -120,13 +120,9 @@ than the number requested. If you want to just query the capabilities without making any other changes, then set ``count`` to 0, ``memory`` to ``V4L2_MEMORY_MMAP`` and ``format.type`` to the buffer type. - * - __u32 - - ``flags`` - - Specifies additional buffer management attributes. - See :ref:`memory-flags`. * - __u32 - - ``reserved``\ [6] + - ``reserved``\ [7] - A place holder for future extensions. Drivers and applications must set the array to zero. diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst index 75d894d9c36c..0e3e2fde65e8 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst @@ -112,17 +112,10 @@ aborting or finishing any DMA in progress, an implicit ``V4L2_MEMORY_MMAP`` and ``type`` set to the buffer type. This will free any previously allocated buffers, so this is typically something that will be done at the start of the application. - * - union { - - (anonymous) - * - __u32 - - ``flags`` - - Specifies additional buffer management attributes. - See :ref:`memory-flags`. * - __u32 - ``reserved``\ [1] - - Kept for backwards compatibility. Use ``flags`` instead. - * - } - - + - A place holder for future extensions. Drivers and applications + must set the array to zero. .. tabularcolumns:: |p{6.1cm}|p{2.2cm}|p{8.7cm}| @@ -169,7 +162,6 @@ aborting or finishing any DMA in progress, an implicit - This capability is set by the driver to indicate that the queue supports cache and memory management hints. However, it's only valid when the queue is used for :ref:`memory mapping ` streaming I/O. See - :ref:`V4L2_FLAG_MEMORY_NON_CONSISTENT `, :ref:`V4L2_BUF_FLAG_NO_CACHE_INVALIDATE ` and :ref:`V4L2_BUF_FLAG_NO_CACHE_CLEAN `. diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index f544d3393e9d..4eab6d81cce1 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -721,39 +721,14 @@ int vb2_verify_memory_type(struct vb2_queue *q, } EXPORT_SYMBOL(vb2_verify_memory_type); -static void set_queue_consistency(struct vb2_queue *q, bool consistent_mem) -{ - q->dma_attrs &= ~DMA_ATTR_NON_CONSISTENT; - - if (!vb2_queue_allows_cache_hints(q)) - return; - if (!consistent_mem) - q->dma_attrs |= DMA_ATTR_NON_CONSISTENT; -} - -static bool verify_consistency_attr(struct vb2_queue *q, bool consistent_mem) -{ - bool queue_is_consistent = !(q->dma_attrs & DMA_ATTR_NON_CONSISTENT); - - if (consistent_mem != queue_is_consistent) { - dprintk(q, 1, "memory consistency model mismatch\n"); - return false; - } - return true; -} - int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count) + unsigned int *count) { unsigned int num_buffers, allocated_buffers, num_planes = 0; unsigned plane_sizes[VB2_MAX_PLANES] = { }; - bool consistent_mem = true; unsigned int i; int ret; - if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) - consistent_mem = false; - if (q->streaming) { dprintk(q, 1, "streaming active\n"); return -EBUSY; @@ -765,8 +740,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, } if (*count == 0 || q->num_buffers != 0 || - (q->memory != VB2_MEMORY_UNKNOWN && q->memory != memory) || - !verify_consistency_attr(q, consistent_mem)) { + (q->memory != VB2_MEMORY_UNKNOWN && q->memory != memory)) { /* * We already have buffers allocated, so first check if they * are not in use and can be freed. @@ -803,7 +777,6 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, num_buffers = min_t(unsigned int, num_buffers, VB2_MAX_FRAME); memset(q->alloc_devs, 0, sizeof(q->alloc_devs)); q->memory = memory; - set_queue_consistency(q, consistent_mem); /* * Ask the driver how many buffers and planes per buffer it requires. @@ -888,18 +861,14 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, EXPORT_SYMBOL_GPL(vb2_core_reqbufs); int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count, + unsigned int *count, unsigned int requested_planes, const unsigned int requested_sizes[]) { unsigned int num_planes = 0, num_buffers, allocated_buffers; unsigned plane_sizes[VB2_MAX_PLANES] = { }; - bool consistent_mem = true; int ret; - if (flags & V4L2_FLAG_MEMORY_NON_CONSISTENT) - consistent_mem = false; - if (q->num_buffers == VB2_MAX_FRAME) { dprintk(q, 1, "maximum number of buffers already allocated\n"); return -ENOBUFS; @@ -912,15 +881,12 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, } memset(q->alloc_devs, 0, sizeof(q->alloc_devs)); q->memory = memory; - set_queue_consistency(q, consistent_mem); q->waiting_for_buffers = !q->is_output; } else { if (q->memory != memory) { dprintk(q, 1, "memory model mismatch\n"); return -EINVAL; } - if (!verify_consistency_attr(q, consistent_mem)) - return -EINVAL; } num_buffers = min(*count, VB2_MAX_FRAME - q->num_buffers); @@ -2581,7 +2547,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read) fileio->memory = VB2_MEMORY_MMAP; fileio->type = q->type; q->fileio = fileio; - ret = vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + ret = vb2_core_reqbufs(q, fileio->memory, &fileio->count); if (ret) goto err_kfree; @@ -2638,7 +2604,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read) err_reqbufs: fileio->count = 0; - vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + vb2_core_reqbufs(q, fileio->memory, &fileio->count); err_kfree: q->fileio = NULL; @@ -2658,7 +2624,7 @@ static int __vb2_cleanup_fileio(struct vb2_queue *q) vb2_core_streamoff(q, q->type); q->fileio = NULL; fileio->count = 0; - vb2_core_reqbufs(q, fileio->memory, 0, &fileio->count); + vb2_core_reqbufs(q, fileio->memory, &fileio->count); kfree(fileio); dprintk(q, 3, "file io emulator closed\n"); } diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index ec3446cc45b8..7b1b86ec942d 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -42,11 +42,6 @@ struct vb2_dc_buf { struct dma_buf_attachment *db_attach; }; -static inline bool vb2_dc_buffer_consistent(unsigned long attr) -{ - return !(attr & DMA_ATTR_NON_CONSISTENT); -} - /*********************************************/ /* scatterlist table functions */ /*********************************************/ @@ -341,13 +336,6 @@ static int vb2_dc_dmabuf_ops_begin_cpu_access(struct dma_buf *dbuf, enum dma_data_direction direction) { - struct vb2_dc_buf *buf = dbuf->priv; - struct sg_table *sgt = buf->dma_sgt; - - if (vb2_dc_buffer_consistent(buf->attrs)) - return 0; - - dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir); return 0; } @@ -355,13 +343,6 @@ static int vb2_dc_dmabuf_ops_end_cpu_access(struct dma_buf *dbuf, enum dma_data_direction direction) { - struct vb2_dc_buf *buf = dbuf->priv; - struct sg_table *sgt = buf->dma_sgt; - - if (vb2_dc_buffer_consistent(buf->attrs)) - return 0; - - dma_sync_sg_for_device(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir); return 0; } diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 0a40e00f0d7e..a86fce5d8ea8 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -123,8 +123,7 @@ static void *vb2_dma_sg_alloc(struct device *dev, unsigned long dma_attrs, /* * NOTE: dma-sg allocates memory using the page allocator directly, so * there is no memory consistency guarantee, hence dma-sg ignores DMA - * attributes passed from the upper layer. That means that - * V4L2_FLAG_MEMORY_NON_CONSISTENT has no effect on dma-sg buffers. + * attributes passed from the upper layer. */ buf->pages = kvmalloc_array(buf->num_pages, sizeof(struct page *), GFP_KERNEL | __GFP_ZERO); diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index 30caad27281e..cfe197df970d 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -722,22 +722,12 @@ static void fill_buf_caps(struct vb2_queue *q, u32 *caps) #endif } -static void clear_consistency_attr(struct vb2_queue *q, - int memory, - unsigned int *flags) -{ - if (!q->allow_cache_hints || memory != V4L2_MEMORY_MMAP) - *flags &= ~V4L2_FLAG_MEMORY_NON_CONSISTENT; -} - int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req) { int ret = vb2_verify_memory_type(q, req->memory, req->type); fill_buf_caps(q, &req->capabilities); - clear_consistency_attr(q, req->memory, &req->flags); - return ret ? ret : vb2_core_reqbufs(q, req->memory, - req->flags, &req->count); + return ret ? ret : vb2_core_reqbufs(q, req->memory, &req->count); } EXPORT_SYMBOL_GPL(vb2_reqbufs); @@ -769,7 +759,6 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) unsigned i; fill_buf_caps(q, &create->capabilities); - clear_consistency_attr(q, create->memory, &create->flags); create->index = q->num_buffers; if (create->count == 0) return ret != -EBUSY ? ret : 0; @@ -813,7 +802,6 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create) if (requested_sizes[i] == 0) return -EINVAL; return ret ? ret : vb2_core_create_bufs(q, create->memory, - create->flags, &create->count, requested_planes, requested_sizes); @@ -998,12 +986,11 @@ int vb2_ioctl_reqbufs(struct file *file, void *priv, int res = vb2_verify_memory_type(vdev->queue, p->memory, p->type); fill_buf_caps(vdev->queue, &p->capabilities); - clear_consistency_attr(vdev->queue, p->memory, &p->flags); if (res) return res; if (vb2_queue_is_busy(vdev, file)) return -EBUSY; - res = vb2_core_reqbufs(vdev->queue, p->memory, p->flags, &p->count); + res = vb2_core_reqbufs(vdev->queue, p->memory, &p->count); /* If count == 0, then the owner has released all buffers and he is no longer owner of the queue. Otherwise we have a new owner. */ if (res == 0) @@ -1021,7 +1008,6 @@ int vb2_ioctl_create_bufs(struct file *file, void *priv, p->index = vdev->queue->num_buffers; fill_buf_caps(vdev->queue, &p->capabilities); - clear_consistency_attr(vdev->queue, p->memory, &p->flags); /* * If count == 0, then just check if memory and type are valid. * Any -EBUSY result from vb2_verify_memory_type can be mapped to 0. diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c index 959d110407a4..6974f1731529 100644 --- a/drivers/media/dvb-core/dvb_vb2.c +++ b/drivers/media/dvb-core/dvb_vb2.c @@ -342,7 +342,7 @@ int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req) ctx->buf_siz = req->size; ctx->buf_cnt = req->count; - ret = vb2_core_reqbufs(&ctx->vb_q, VB2_MEMORY_MMAP, 0, &req->count); + ret = vb2_core_reqbufs(&ctx->vb_q, VB2_MEMORY_MMAP, &req->count); if (ret) { ctx->state = DVB_VB2_STATE_NONE; dprintk(1, "[%s] count=%d size=%d errno=%d\n", ctx->name, diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 593bcf6c3735..a99e82ec9ab6 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -246,9 +246,6 @@ struct v4l2_format32 { * @memory: buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. - * @flags: additional buffer management attributes (ignored unless the - * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability and - * configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers32 { @@ -257,8 +254,7 @@ struct v4l2_create_buffers32 { __u32 memory; /* enum v4l2_memory */ struct v4l2_format32 format; __u32 capabilities; - __u32 flags; - __u32 reserved[6]; + __u32 reserved[7]; }; static int __bufsize_v4l2_format(struct v4l2_format32 __user *p32, u32 *size) @@ -359,8 +355,7 @@ static int get_v4l2_create32(struct v4l2_create_buffers __user *p64, { if (!access_ok(p32, sizeof(*p32)) || copy_in_user(p64, p32, - offsetof(struct v4l2_create_buffers32, format)) || - assign_in_user(&p64->flags, &p32->flags)) + offsetof(struct v4l2_create_buffers32, format))) return -EFAULT; return __get_v4l2_format32(&p64->format, &p32->format, aux_buf, aux_space); @@ -422,7 +417,6 @@ static int put_v4l2_create32(struct v4l2_create_buffers __user *p64, copy_in_user(p32, p64, offsetof(struct v4l2_create_buffers32, format)) || assign_in_user(&p32->capabilities, &p64->capabilities) || - assign_in_user(&p32->flags, &p64->flags) || copy_in_user(p32->reserved, p64->reserved, sizeof(p64->reserved))) return -EFAULT; return __put_v4l2_format32(&p64->format, &p32->format); diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index e3a25ea913ac..ccf947632a3b 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2044,6 +2044,9 @@ static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; + + CLEAR_AFTER_FIELD(p, capabilities); + return ops->vidioc_reqbufs(file, fh, p); } @@ -2083,7 +2086,7 @@ static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops, if (ret) return ret; - CLEAR_AFTER_FIELD(create, flags); + CLEAR_AFTER_FIELD(create, capabilities); v4l_sanitize_format(&create->format); diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index 52ef92049073..bbb3f26fbde9 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -744,8 +744,6 @@ void vb2_core_querybuf(struct vb2_queue *q, unsigned int index, void *pb); * vb2_core_reqbufs() - Initiate streaming. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. - * @flags: auxiliary queue/buffer management flags. Currently, the only - * used flag is %V4L2_FLAG_MEMORY_NON_CONSISTENT. * @count: requested buffer count. * * Videobuf2 core helper to implement VIDIOC_REQBUF() operation. It is called @@ -770,13 +768,12 @@ void vb2_core_querybuf(struct vb2_queue *q, unsigned int index, void *pb); * Return: returns zero on success; an error code otherwise. */ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count); + unsigned int *count); /** * vb2_core_create_bufs() - Allocate buffers and any required auxiliary structs * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. - * @flags: auxiliary queue/buffer management flags. * @count: requested buffer count. * @requested_planes: number of planes requested. * @requested_sizes: array with the size of the planes. @@ -794,7 +791,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, * Return: returns zero on success; an error code otherwise. */ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, - unsigned int flags, unsigned int *count, + unsigned int *count, unsigned int requested_planes, const unsigned int requested_sizes[]); diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c7b70ff53bc1..235db7754606 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -191,8 +191,6 @@ enum v4l2_memory { V4L2_MEMORY_DMABUF = 4, }; -#define V4L2_FLAG_MEMORY_NON_CONSISTENT (1 << 0) - /* see also http://vektor.theorem.ca/graphics/ycbcr/ */ enum v4l2_colorspace { /* @@ -949,10 +947,7 @@ struct v4l2_requestbuffers { __u32 type; /* enum v4l2_buf_type */ __u32 memory; /* enum v4l2_memory */ __u32 capabilities; - union { - __u32 flags; - __u32 reserved[1]; - }; + __u32 reserved[1]; }; /* capabilities for struct v4l2_requestbuffers and v4l2_create_buffers */ @@ -2456,9 +2451,6 @@ struct v4l2_dbg_chip_info { * @memory: enum v4l2_memory; buffer memory type * @format: frame format, for which buffers are requested * @capabilities: capabilities of this buffer type. - * @flags: additional buffer management attributes (ignored unless the - * queue has V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS capability - * and configured for MMAP streaming I/O). * @reserved: future extensions */ struct v4l2_create_buffers { @@ -2467,8 +2459,7 @@ struct v4l2_create_buffers { __u32 memory; struct v4l2_format format; __u32 capabilities; - __u32 flags; - __u32 reserved[6]; + __u32 reserved[7]; }; /* -- cgit v1.2.3 From 1869e226a7b3ef75b4f70ede2f1b7229f7157fa4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 13 Sep 2020 12:43:39 -0600 Subject: ipv4: Initialize flowi4_multipath_hash in data path flowi4_multipath_hash was added by the commit referenced below for tunnels. Unfortunately, the patch did not initialize the new field for several fast path lookups that do not initialize the entire flow struct to 0. Fix those locations. Currently, flowi4_multipath_hash is random garbage and affects the hash value computed by fib_multipath_hash for multipath selection. Fixes: 24ba14406c5c ("route: Add multipath_hash in flowi_common to make user-define hash") Signed-off-by: David Ahern Cc: wenxu Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/core/filter.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 1 + 4 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 929d3ca614d0..b2531df3f65f 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -116,6 +116,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; + fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index 1f647ab986b6..1b168371ba96 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4838,6 +4838,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; + fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 41079490a118..86a23e4a6a50 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -362,6 +362,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8ca6bcab7b03..e5f210d00851 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2147,6 +2147,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; -- cgit v1.2.3 From 13e6ce98aa65ab5ce19351c020419360dfe8af29 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 13 Sep 2020 19:51:50 +0800 Subject: net: sched: only keep the available bits when setting vxlan md->gbp As we can see from vxlan_build/parse_gbp_hdr(), when processing metadata on vxlan rx/tx path, only dont_learn/policy_applied/policy_id fields can be set to or parse from the packet for vxlan gbp option. So we'd better do the mask when set it in act_tunnel_key and cls_flower. Otherwise, when users don't know these bits, they may configure with a value which can never be matched. Reported-by: Shuang Li Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/vxlan.h | 3 +++ net/sched/act_tunnel_key.c | 1 + net/sched/cls_flower.c | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 3a41627cbdfe..08537aa14f7c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -121,6 +121,9 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) +#define VXLAN_GBP_MASK (VXLAN_GBP_DONT_LEARN | VXLAN_GBP_POLICY_APPLIED | \ + VXLAN_GBP_ID_MASK) + /* * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 536c4bc31be6..37f1e10f35e0 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -156,6 +156,7 @@ tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, struct vxlan_metadata *md = dst; md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); + md->gbp &= VXLAN_GBP_MASK; } return sizeof(struct vxlan_metadata); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a4f7ef1de7e7..e8fda1bd4d9d 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1175,8 +1175,10 @@ static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key, return -EINVAL; } - if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) + if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) { md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]); + md->gbp &= VXLAN_GBP_MASK; + } return sizeof(*md); } -- cgit v1.2.3 From 27ba3e8ff3ab86449e63d38a8d623053591e65fa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 15 Sep 2020 16:33:46 +0900 Subject: scsi: sd: sd_zbc: Fix handling of host-aware ZBC disks When CONFIG_BLK_DEV_ZONED is disabled, allow using host-aware ZBC disks as regular disks. In this case, ensure that command completion is correctly executed by changing sd_zbc_complete() to return good_bytes instead of 0 and causing a hang during device probe (endless retries). When CONFIG_BLK_DEV_ZONED is enabled and a host-aware disk is detected to have partitions, it will be used as a regular disk. In this case, make sure to not do anything in sd_zbc_revalidate_zones() as that triggers warnings. Since all these different cases result in subtle settings of the disk queue zoned model, introduce the block layer helper function blk_queue_set_zoned() to generically implement setting up the effective zoned model according to the disk type, the presence of partitions on the disk and CONFIG_BLK_DEV_ZONED configuration. Link: https://lore.kernel.org/r/20200915073347.832424-2-damien.lemoal@wdc.com Fixes: b72053072c0b ("block: allow partitions on host aware zone devices") Cc: Reported-by: Borislav Petkov Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- block/blk-settings.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/scsi/sd.c | 30 ++++++++++++++++++------------ drivers/scsi/sd.h | 2 +- drivers/scsi/sd_zbc.c | 6 +++++- include/linux/blkdev.h | 2 ++ 5 files changed, 72 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 76a7e03bcd6c..34b721a2743a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -801,6 +801,52 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); +/** + * blk_queue_set_zoned - configure a disk queue zoned model. + * @disk: the gendisk of the queue to configure + * @model: the zoned model to set + * + * Set the zoned model of the request queue of @disk according to @model. + * When @model is BLK_ZONED_HM (host managed), this should be called only + * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). + * If @model specifies BLK_ZONED_HA (host aware), the effective model used + * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions + * on the disk. + */ +void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +{ + switch (model) { + case BLK_ZONED_HM: + /* + * Host managed devices are supported only if + * CONFIG_BLK_DEV_ZONED is enabled. + */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); + break; + case BLK_ZONED_HA: + /* + * Host aware devices can be treated either as regular block + * devices (similar to drive managed devices) or as zoned block + * devices to take advantage of the zone command set, similarly + * to host managed devices. We try the latter if there are no + * partitions and zoned block device support is enabled, else + * we do nothing special as far as the block layer is concerned. + */ + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || + disk_has_partitions(disk)) + model = BLK_ZONED_NONE; + break; + case BLK_ZONED_NONE: + default: + if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) + model = BLK_ZONED_NONE; + break; + } + + disk->queue->limits.zoned = model; +} +EXPORT_SYMBOL_GPL(blk_queue_set_zoned); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 95018e650f2d..06286b6aeaec 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2964,26 +2964,32 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) if (sdkp->device->type == TYPE_ZBC) { /* Host-managed */ - q->limits.zoned = BLK_ZONED_HM; + blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HM); } else { sdkp->zoned = (buffer[8] >> 4) & 3; - if (sdkp->zoned == 1 && !disk_has_partitions(sdkp->disk)) { + if (sdkp->zoned == 1) { /* Host-aware */ - q->limits.zoned = BLK_ZONED_HA; + blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HA); } else { - /* - * Treat drive-managed devices and host-aware devices - * with partitions as regular block devices. - */ - q->limits.zoned = BLK_ZONED_NONE; - if (sdkp->zoned == 2 && sdkp->first_scan) - sd_printk(KERN_NOTICE, sdkp, - "Drive-managed SMR disk\n"); + /* Regular disk or drive managed disk */ + blk_queue_set_zoned(sdkp->disk, BLK_ZONED_NONE); } } - if (blk_queue_is_zoned(q) && sdkp->first_scan) + + if (!sdkp->first_scan) + goto out; + + if (blk_queue_is_zoned(q)) { sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n", q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware"); + } else { + if (sdkp->zoned == 1) + sd_printk(KERN_NOTICE, sdkp, + "Host-aware SMR disk used as regular disk\n"); + else if (sdkp->zoned == 2) + sd_printk(KERN_NOTICE, sdkp, + "Drive-managed SMR disk\n"); + } out: kfree(buffer); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 4933e7daf17d..7251434100e6 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -259,7 +259,7 @@ static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { - return 0; + return good_bytes; } static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 0e94ff056bff..a739456dea02 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -667,7 +667,11 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) u32 max_append; int ret = 0; - if (!sd_is_zoned(sdkp)) + /* + * There is nothing to do for regular disks, including host-aware disks + * that have partitions. + */ + if (!blk_queue_is_zoned(q)) return 0; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb5636cc17b9..868e11face00 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -352,6 +352,8 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model); + #ifdef CONFIG_BLK_DEV_ZONED #define BLK_ALL_ZONES ((unsigned int)-1) -- cgit v1.2.3 From ffbc3dd1975f1e2dac7b1752aa8b5cac3cd5b459 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 16 Sep 2020 23:18:43 +0300 Subject: fs: fix cast in fsparam_u32hex() macro Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- include/linux/fs_parser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 2eab6d5f6736..aab0ffc6bac6 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -120,7 +120,7 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) #define fsparam_u32hex(NAME, OPT) \ - __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *16)) + __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From 19a83d36f9837e8bd27435ebb31564a717a5d15a Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 17 Sep 2020 01:04:10 +0200 Subject: ethtool: add and use message type for tunnel info reply Tunnel offload info code uses ETHTOOL_MSG_TUNNEL_INFO_GET message type (cmd field in genetlink header) for replies to tunnel info netlink request, i.e. the same value as the request have. This is a problem because we are using two separate enums for userspace to kernel and kernel to userspace message types so that this ETHTOOL_MSG_TUNNEL_INFO_GET (28) collides with ETHTOOL_MSG_CABLE_TEST_TDR_NTF which is what message type 28 means for kernel to userspace messages. As the tunnel info request reached mainline in 5.9 merge window, we should still be able to fix the reply message type without breaking backward compatibility. Fixes: c7d759eb7b12 ("ethtool: add tunnel info interface") Signed-off-by: Michal Kubecek Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/tunnels.c | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d53bcb31645a..b5a79881551f 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -206,6 +206,7 @@ Userspace to kernel: ``ETHTOOL_MSG_TSINFO_GET`` get timestamping info ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test + ``ETHTOOL_MSG_TUNNEL_INFO_GET`` get tunnel offload info ===================================== ================================ Kernel to userspace: @@ -239,6 +240,7 @@ Kernel to userspace: ``ETHTOOL_MSG_TSINFO_GET_REPLY`` timestamping info ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results + ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1363,4 +1365,5 @@ are netlink only. ``ETHTOOL_SFECPARAM`` n/a n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' + n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` =================================== ===================================== diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5dcd24cb33ea..72ba36be9655 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -79,6 +79,7 @@ enum { ETHTOOL_MSG_TSINFO_GET_REPLY, ETHTOOL_MSG_CABLE_TEST_NTF, ETHTOOL_MSG_CABLE_TEST_TDR_NTF, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index 84f23289475b..d93bf2da0f34 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -200,7 +200,7 @@ int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, req_info.dev, - ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_A_TUNNEL_INFO_HEADER, info, &reply_payload); if (!rskb) { @@ -273,7 +273,7 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) goto cont; ehdr = ethnl_dump_put(skb, cb, - ETHTOOL_MSG_TUNNEL_INFO_GET); + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY); if (!ehdr) { ret = -EMSGSIZE; goto out; -- cgit v1.2.3 From b9df46d08a8d098ea2124cb9e3b84458a474b4d4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Sep 2020 09:19:43 -0400 Subject: pNFS/flexfiles: Be consistent about mirror index types A mirror index is always of type u32. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 34 +++++++++++++++++----------------- include/linux/nfs_xdr.h | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1edeebd51937..a163533446fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx, + u32 start_idx, u32 *best_idx, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; - int idx; + u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { @@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, int *best_idx) + u32 start_idx, u32 *best_idx) { struct nfs4_pnfs_ds *ds; @@ -791,7 +791,8 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static struct nfs4_pnfs_ds * -ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx) +ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, + u32 *best_idx) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; @@ -837,8 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - int ds_idx; - u32 i; + u32 ds_idx, i; retry: ff_layout_pg_check_layout(pgio, req); @@ -895,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - int i; + u32 i; retry: ff_layout_pg_check_layout(pgio, req); @@ -1039,7 +1039,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; - int new_idx = 0; + u32 new_idx = 0; if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) ff_layout_send_layouterror(hdr->lseg); @@ -1076,7 +1076,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; @@ -1150,7 +1150,7 @@ reset: /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ static int ff_layout_async_handle_error_v3(struct rpc_task *task, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1185,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - int idx) + u32 idx) { int vers = clp->cl_nfs_mod->rpc_vers->number; @@ -1212,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - int idx, u64 offset, u64 length, + u32 idx, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1810,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; - int idx = hdr->pgio_mirror_idx; + u32 idx = hdr->pgio_mirror_idx; mirror = FF_LAYOUT_COMP(lseg, idx); ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9408f3252c8e..69cb46f7b8d2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1611,8 +1611,8 @@ struct nfs_pgio_header { __u64 mds_offset; /* Filelayout dense stripe */ struct nfs_page_array page_array; struct nfs_client *ds_clp; /* pNFS data server */ - int ds_commit_idx; /* ds index if ds_clp is set */ - int pgio_mirror_idx;/* mirror index in pgio layer */ + u32 ds_commit_idx; /* ds index if ds_clp is set */ + u32 pgio_mirror_idx;/* mirror index in pgio layer */ }; struct nfs_mds_commit_info { -- cgit v1.2.3 From 54fa9ba564b717fc2cf689427e195c360315999d Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 7 Sep 2020 11:32:07 +0200 Subject: ftrace: Let ftrace_enable_sysctl take a kernel pointer buffer Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") changed ctl_table.proc_handler to take a kernel pointer. Adjust the signature of ftrace_enable_sysctl to match ctl_table.proc_handler which fixes the following sparse warning: kernel/trace/ftrace.c:7544:43: warning: incorrect type in argument 3 (different address spaces) kernel/trace/ftrace.c:7544:43: expected void * kernel/trace/ftrace.c:7544:43: got void [noderef] __user *buffer Link: https://lkml.kernel.org/r/20200907093207.13540-1-tklauser@distanz.ch Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Cc: Andrew Morton Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ce2c06f72e86..e5c2d5cc6e6a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -85,8 +85,7 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); struct ftrace_ops; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e9f893388245..603255f5f085 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7534,8 +7534,7 @@ static bool is_permanent_ops_registered(void) int ftrace_enable_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = -ENODEV; -- cgit v1.2.3 From 82d083ab60c3693201c6f5c7a5f23a6ed422098d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2020 17:55:05 +0900 Subject: kprobes: tracing/kprobes: Fix to kill kprobes on initmem after boot Since kprobe_event= cmdline option allows user to put kprobes on the functions in initmem, kprobe has to make such probes gone after boot. Currently the probes on the init functions in modules will be handled by module callback, but the kernel init text isn't handled. Without this, kprobes may access non-exist text area to disable or remove it. Link: https://lkml.kernel.org/r/159972810544.428528.1839307531600646955.stgit@devnote2 Fixes: 970988e19eb0 ("tracing/kprobe: Add kprobe_event= boot parameter") Cc: Jonathan Corbet Cc: Shuah Khan Cc: Randy Dunlap Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 5 +++++ init/main.c | 2 ++ kernel/kprobes.c | 22 ++++++++++++++++++++++ 3 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9be1bff4f586..8aab327b5539 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -373,6 +373,8 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); +void kprobe_free_init_mem(void); + int disable_kprobe(struct kprobe *kp); int enable_kprobe(struct kprobe *kp); @@ -435,6 +437,9 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } +static inline void kprobe_free_init_mem(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -ENOSYS; diff --git a/init/main.c b/init/main.c index ae78fb68d231..038128b2a755 100644 --- a/init/main.c +++ b/init/main.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1402,6 +1403,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + kprobe_free_init_mem(); ftrace_free_init_mem(); free_initmem(); mark_readonly(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d43b48ecdb4f..d750e025d1ff 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2453,6 +2453,28 @@ static struct notifier_block kprobe_module_nb = { extern unsigned long __start_kprobe_blacklist[]; extern unsigned long __stop_kprobe_blacklist[]; +void kprobe_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + struct hlist_head *head; + struct kprobe *p; + int i; + + mutex_lock(&kprobe_mutex); + + /* Kill all kprobes on initmem */ + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry(p, head, hlist) { + if (start <= (void *)p->addr && (void *)p->addr < end) + kill_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); +} + static int __init init_kprobes(void) { int i, err = 0; -- cgit v1.2.3 From 6565243c0677aa2befa5a953cf11bc7b4a6f0a47 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 18 Sep 2020 04:07:24 +0300 Subject: net: mscc: ocelot: add locking for the port TX timestamp ID The ocelot_port->ts_id is used to: (a) populate skb->cb[0] for matching the TX timestamp in the PTP IRQ with an skb. (b) populate the REW_OP from the injection header of the ongoing skb. Only then is ocelot_port->ts_id incremented. This is a problem because, at least theoretically, another timestampable skb might use the same ocelot_port->ts_id before that is incremented. Normally all transmit calls are serialized by the netdev transmit spinlock, but in this case, ocelot_port_add_txtstamp_skb() is also called by DSA, which has started declaring the NETIF_F_LLTX feature since commit 2b86cb829976 ("net: dsa: declare lockless TX feature for slave ports"). So the logic of using and incrementing the timestamp id should be atomic per port. The solution is to use the global ocelot_port->ts_id only while protected by the associated ocelot_port->ts_id_lock. That's where we populate skb->cb[0]. Note that for ocelot, ocelot_port_add_txtstamp_skb is called for the actual skb, but for felix, it is called for the skb's clone. That is something which will also be changed in the future. Signed-off-by: Vladimir Oltean Reviewed-by: Horatiu Vultur Reviewed-by: Florian Fainelli Tested-by: Alexandre Belloni Reviewed-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 8 +++++++- drivers/net/ethernet/mscc/ocelot_net.c | 6 ++---- include/soc/mscc/ocelot.h | 1 + net/dsa/tag_ocelot.c | 11 +++++++---- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 5abb7d2b0a9e..83eb7c325061 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -421,10 +421,15 @@ int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP && ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + spin_lock(&ocelot_port->ts_id_lock); + shinfo->tx_flags |= SKBTX_IN_PROGRESS; /* Store timestamp ID in cb[0] of sk_buff */ - skb->cb[0] = ocelot_port->ts_id % 4; + skb->cb[0] = ocelot_port->ts_id; + ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; skb_queue_tail(&ocelot_port->tx_skbs, skb); + + spin_unlock(&ocelot_port->ts_id_lock); return 0; } return -ENODATA; @@ -1300,6 +1305,7 @@ void ocelot_init_port(struct ocelot *ocelot, int port) struct ocelot_port *ocelot_port = ocelot->ports[port]; skb_queue_head_init(&ocelot_port->tx_skbs); + spin_lock_init(&ocelot_port->ts_id_lock); /* Basic L2 initialization */ diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index cacabc23215a..8490e42e9e2d 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -349,10 +349,8 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP) { info.rew_op = ocelot_port->ptp_cmd; - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - info.rew_op |= (ocelot_port->ts_id % 4) << 3; - ocelot_port->ts_id++; - } + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + info.rew_op |= skb->cb[0] << 3; } ocelot_gen_ifh(ifh, &info); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index da369b12005f..4521dd602ddc 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -566,6 +566,7 @@ struct ocelot_port { u8 ptp_cmd; struct sk_buff_head tx_skbs; u8 ts_id; + spinlock_t ts_id_lock; phy_interface_t phy_mode; diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 42f327c06dca..b4fc05cafaa6 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -160,11 +160,14 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, packing(injection, &qos_class, 19, 17, OCELOT_TAG_LEN, PACK, 0); if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { + struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + rew_op = ocelot_port->ptp_cmd; - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - rew_op |= (ocelot_port->ts_id % 4) << 3; - ocelot_port->ts_id++; - } + /* Retrieve timestamp ID populated inside skb->cb[0] of the + * clone by ocelot_port_add_txtstamp_skb + */ + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) + rew_op |= clone->cb[0] << 3; packing(injection, &rew_op, 125, 117, OCELOT_TAG_LEN, PACK, 0); } -- cgit v1.2.3 From e5fb512d81d021b7c7a0c2547c3dafb9de759285 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 18 Sep 2020 04:07:30 +0300 Subject: net: mscc: ocelot: deinitialize only initialized ports Currently mscc_ocelot_init_ports() will skip initializing a port when it doesn't have a phy-handle, so the ocelot->ports[port] pointer will be NULL. Take this into consideration when tearing down the driver, and add a new function ocelot_deinit_port() to the switch library, mirror of ocelot_init_port(), which needs to be called by the driver for all ports it has initialized. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Tested-by: Alexandre Belloni Reviewed-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 3 +++ drivers/net/ethernet/mscc/ocelot.c | 16 ++++++++-------- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 2 ++ include/soc/mscc/ocelot.h | 1 + 4 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 66da447c4096..01427cd08448 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -643,10 +643,13 @@ static void felix_teardown(struct dsa_switch *ds) { struct ocelot *ocelot = ds->priv; struct felix *felix = ocelot_to_felix(ocelot); + int port; if (felix->info->mdio_bus_free) felix->info->mdio_bus_free(ocelot); + for (port = 0; port < ocelot->num_phys_ports; port++) + ocelot_deinit_port(ocelot, port); ocelot_deinit_timestamp(ocelot); /* stop workqueue thread */ ocelot_deinit(ocelot); diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 83eb7c325061..8518e1d60da4 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1550,18 +1550,18 @@ EXPORT_SYMBOL(ocelot_init); void ocelot_deinit(struct ocelot *ocelot) { - struct ocelot_port *port; - int i; - cancel_delayed_work(&ocelot->stats_work); destroy_workqueue(ocelot->stats_queue); mutex_destroy(&ocelot->stats_lock); - - for (i = 0; i < ocelot->num_phys_ports; i++) { - port = ocelot->ports[i]; - skb_queue_purge(&port->tx_skbs); - } } EXPORT_SYMBOL(ocelot_deinit); +void ocelot_deinit_port(struct ocelot *ocelot, int port) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + skb_queue_purge(&ocelot_port->tx_skbs); +} +EXPORT_SYMBOL(ocelot_deinit_port); + MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 252c49b5f22b..e02fb8bfab63 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -908,6 +908,8 @@ static void mscc_ocelot_release_ports(struct ocelot *ocelot) if (!ocelot_port) continue; + ocelot_deinit_port(ocelot, port); + priv = container_of(ocelot_port, struct ocelot_port_private, port); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 4521dd602ddc..0ac4e7fba086 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -678,6 +678,7 @@ void ocelot_configure_cpu(struct ocelot *ocelot, int npi, int ocelot_init(struct ocelot *ocelot); void ocelot_deinit(struct ocelot *ocelot); void ocelot_init_port(struct ocelot *ocelot, int port); +void ocelot_deinit_port(struct ocelot *ocelot, int port); /* DSA callbacks */ void ocelot_port_enable(struct ocelot *ocelot, int port, -- cgit v1.2.3 From fe81d9f6182d1160e625894eecb3d7ff0222cac5 Mon Sep 17 00:00:00 2001 From: Henry Ptasinski Date: Sat, 19 Sep 2020 00:12:11 +0000 Subject: net: sctp: Fix IPv6 ancestor_size calc in sctp_copy_descendant When calculating ancestor_size with IPv6 enabled, simply using sizeof(struct ipv6_pinfo) doesn't account for extra bytes needed for alignment in the struct sctp6_sock. On x86, there aren't any extra bytes, but on ARM the ipv6_pinfo structure is aligned on an 8-byte boundary so there were 4 pad bytes that were omitted from the ancestor_size calculation. This would lead to corruption of the pd_lobby pointers, causing an oops when trying to free the sctp structure on socket close. Fixes: 636d25d557d1 ("sctp: not copy sctp_sock pd_lobby in sctp_copy_descendant") Signed-off-by: Henry Ptasinski Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 8 +++++--- net/sctp/socket.c | 9 +++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index b33f1aefad09..0bdff38eb4bb 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -226,12 +226,14 @@ struct sctp_sock { data_ready_signalled:1; atomic_t pd_mode; + + /* Fields after this point will be skipped on copies, like on accept + * and peeloff operations + */ + /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; - /* These must be the last fields, as they will skipped on copies, - * like on accept and peeloff operations - */ struct list_head auto_asconf_list; int do_auto_asconf; }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 836615f71a7d..53d0a4161df3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9220,13 +9220,10 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, static inline void sctp_copy_descendant(struct sock *sk_to, const struct sock *sk_from) { - int ancestor_size = sizeof(struct inet_sock) + - sizeof(struct sctp_sock) - - offsetof(struct sctp_sock, pd_lobby); - - if (sk_from->sk_family == PF_INET6) - ancestor_size += sizeof(struct ipv6_pinfo); + size_t ancestor_size = sizeof(struct inet_sock); + ancestor_size += sk_from->sk_prot->obj_size; + ancestor_size -= offsetof(struct sctp_sock, pd_lobby); __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); } -- cgit v1.2.3 From 3aab91774bbd8e571cfaddaf839aafd07718333c Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Fri, 25 Sep 2020 14:00:31 +0800 Subject: block: remove unused BLK_QC_T_EAGAIN flag commit 7b6620d7db56 ("block: remove REQ_NOWAIT_INLINE") removed the REQ_NOWAIT_INLINE related code, but the diff wasn't applied to blk_types.h somehow. Then commit 2771cefeac49 ("block: remove the REQ_NOWAIT_INLINE flag") removed the REQ_NOWAIT_INLINE flag while the BLK_QC_T_EAGAIN flag still remains. Fixes: 7b6620d7db56 ("block: remove REQ_NOWAIT_INLINE") Signed-off-by: Jeffle Xu Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4ecf4fed171f..b3fc5d3dd8ea 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -497,13 +497,12 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U -#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; + return cookie != BLK_QC_T_NONE; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3 From d3f7b1bb204099f2f7306318896223e8599bb6a2 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 25 Sep 2020 21:19:10 -0700 Subject: mm/gup: fix gup_fast with dynamic page table folding Currently to make sure that every page table entry is read just once gup_fast walks perform READ_ONCE and pass pXd value down to the next gup_pXd_range function by value e.g.: static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) ... pudp = pud_offset(&p4d, addr); This function passes a reference on that local value copy to pXd_offset, and might get the very same pointer in return. This happens when the level is folded (on most arches), and that pointer should not be iterated. On s390 due to the fact that each task might have different 5,4 or 3-level address translation and hence different levels folded the logic is more complex and non-iteratable pointer to a local copy leads to severe problems. Here is an example of what happens with gup_fast on s390, for a task with 3-level paging, crossing a 2 GB pud boundary: // addr = 0x1007ffff000, end = 0x10080001000 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; // pud_offset returns &p4d itself (a pointer to a value on stack) pudp = pud_offset(&p4d, addr); do { // on second iteratation reading "random" stack value pud_t pud = READ_ONCE(*pudp); // next = 0x10080000000, due to PUD_SIZE/MASK != PGDIR_SIZE/MASK on s390 next = pud_addr_end(addr, end); ... } while (pudp++, addr = next, addr != end); // pudp++ iterating over stack return 1; } This happens since s390 moved to common gup code with commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust") and commit 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code"). s390 tried to mimic static level folding by changing pXd_offset primitives to always calculate top level page table offset in pgd_offset and just return the value passed when pXd_offset has to act as folded. What is crucial for gup_fast and what has been overlooked is that PxD_SIZE/MASK and thus pXd_addr_end should also change correspondingly. And the latter is not possible with dynamic folding. To fix the issue in addition to pXd values pass original pXdp pointers down to gup_pXd_range functions. And introduce pXd_offset_lockless helpers, which take an additional pXd entry value parameter. This has already been discussed in https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1 Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code") Signed-off-by: Vasily Gorbik Signed-off-by: Andrew Morton Reviewed-by: Gerald Schaefer Reviewed-by: Alexander Gordeev Reviewed-by: Jason Gunthorpe Reviewed-by: Mike Rapoport Reviewed-by: John Hubbard Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Dave Hansen Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jeff Dike Cc: Richard Weinberger Cc: Dave Hansen Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Arnd Bergmann Cc: Andrey Ryabinin Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: [5.2+] Link: https://lkml.kernel.org/r/patch.git-943f1e5dcff2.your-ad-here.call-01599856292-ext-8676@work.hours Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 42 +++++++++++++++++++++++++++++------------ include/linux/pgtable.h | 10 ++++++++++ mm/gup.c | 18 +++++++++--------- 3 files changed, 49 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7eb01a5459cd..b55561cc8786 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long address) { - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) - return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); - return (p4d_t *) pgd; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(pgd) + p4d_index(address); + return (p4d_t *) pgdp; } +#define p4d_offset_lockless p4d_offset_lockless -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address) { - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) - return (pud_t *) p4d_deref(*p4d) + pud_index(address); - return (pud_t *) p4d; + return p4d_offset_lockless(pgdp, *pgdp, address); +} + +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long address) +{ + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(p4d) + pud_index(address); + return (pud_t *) p4dp; +} +#define pud_offset_lockless pud_offset_lockless + +static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address) +{ + return pud_offset_lockless(p4dp, *p4dp, address); } #define pud_offset pud_offset -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +static inline pmd_t *pmd_offset_lockless(pud_t *pudp, pud_t pud, unsigned long address) +{ + if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(pud) + pmd_index(address); + return (pmd_t *) pudp; +} +#define pmd_offset_lockless pmd_offset_lockless + +static inline pmd_t *pmd_offset(pud_t *pudp, unsigned long address) { - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) - return (pmd_t *) pud_deref(*pud) + pmd_index(address); - return (pmd_t *) pud; + return pmd_offset_lockless(pudp, *pudp, address); } #define pmd_offset pmd_offset diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e8cbc2e795d5..90654cb63e9e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,6 +1427,16 @@ typedef unsigned int pgtbl_mod_mask; #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif +#ifndef p4d_offset_lockless +#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) +#endif +#ifndef pud_offset_lockless +#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) +#endif +#ifndef pmd_offset_lockless +#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) +#endif + /* * p?d_leaf() - true if this entry is a final mapping to a physical address. * This differs from p?d_huge() by the fact that they are always available (if diff --git a/mm/gup.c b/mm/gup.c index e5739a1974d5..578bf5bd8bf8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2485,13 +2485,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 1; } -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, +static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; - pmdp = pmd_offset(&pud, addr); + pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = READ_ONCE(*pmdp); @@ -2528,13 +2528,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&p4d, addr); + pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); @@ -2549,20 +2549,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) + } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; - p4dp = p4d_offset(&pgd, addr); + p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); @@ -2574,7 +2574,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) + } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -2602,7 +2602,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) + } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } -- cgit v1.2.3 From c1d0da83358a2316d9be7f229f26126dbaa07468 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 25 Sep 2020 21:19:28 -0700 Subject: mm: replace memmap_context by meminit_context Patch series "mm: fix memory to node bad links in sysfs", v3. Sometimes, firmware may expose interleaved memory layout like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] In that case, we can see memory blocks assigned to multiple nodes in sysfs: $ ls -l /sys/devices/system/memory/memory21 total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 -rw-r--r-- 1 root root 65536 Aug 24 05:27 online -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_device -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_index drwxr-xr-x 2 root root 0 Aug 24 05:27 power -r--r--r-- 1 root root 65536 Aug 24 05:27 removable -rw-r--r-- 1 root root 65536 Aug 24 05:27 state lrwxrwxrwx 1 root root 0 Aug 24 05:25 subsystem -> ../../../../bus/memory -rw-r--r-- 1 root root 65536 Aug 24 05:25 uevent -r--r--r-- 1 root root 65536 Aug 24 05:27 valid_zones The same applies in the node's directory with a memory21 link in both the node1 and node2's directory. This is wrong but doesn't prevent the system to run. However when later, one of these memory blocks is hot-unplugged and then hot-plugged, the system is detecting an inconsistency in the sysfs layout and a BUG_ON() is raised: kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This has been seen on PowerPC LPAR. The root cause of this issue is that when node's memory is registered, the range used can overlap another node's range, thus the memory block is registered to multiple nodes in sysfs. There are two issues here: (a) The sysfs memory and node's layouts are broken due to these multiple links (b) The link errors in link_mem_sections() should not lead to a system panic. To address (a) register_mem_sect_under_node should not rely on the system state to detect whether the link operation is triggered by a hot plug operation or not. This is addressed by the patches 1 and 2 of this series. Issue (b) will be addressed separately. This patch (of 2): The memmap_context enum is used to detect whether a memory operation is due to a hot-add operation or happening at boot time. Make it general to the hotplug operation and rename it as meminit_context. There is no functional change introduced by this patch Suggested-by: David Hildenbrand Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J . Wysocki" Cc: Nathan Lynch Cc: Scott Cheloha Cc: Tony Luck Cc: Fenghua Yu Cc: Link: https://lkml.kernel.org/r/20200915094143.79181-1-ldufour@linux.ibm.com Link: https://lkml.kernel.org/r/20200915132624.9723-1-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 6 +++--- include/linux/mm.h | 2 +- include/linux/mmzone.h | 11 ++++++++--- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 10 +++++----- 5 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0b3fb4c7af29..8e7b8c6c576e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -538,7 +538,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); return 0; } @@ -547,8 +547,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, - NULL); + memmap_init_zone(size, nid, zone, start_pfn, + MEMINIT_EARLY, NULL); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/include/linux/mm.h b/include/linux/mm.h index b2f370f0b420..48614513eb66 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2416,7 +2416,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum memmap_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8379432f4f2f..0f7a4ff4b059 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -824,10 +824,15 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); -enum memmap_context { - MEMMAP_EARLY, - MEMMAP_HOTPLUG, +/* + * Memory initialization context, use to differentiate memory added by + * the platform statically or via memory hotplug interface. + */ +enum meminit_context { + MEMINIT_EARLY, + MEMINIT_HOTPLUG, }; + extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b11a269e2356..345308a8bd15 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -729,7 +729,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMMAP_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap); set_zone_contiguous(zone); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fab5e97dc9ca..5661fa164f13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5975,7 +5975,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context, + unsigned long start_pfn, enum meminit_context context, struct vmem_altmap *altmap) { unsigned long pfn, end_pfn = start_pfn + size; @@ -6007,7 +6007,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { + if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6016,7 +6016,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) + if (context == MEMINIT_HOTPLUG) __SetPageReserved(page); /* @@ -6099,7 +6099,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { @@ -6137,7 +6137,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); } } } -- cgit v1.2.3 From f85086f95fa36194eb0db5cd5c12e56801b98523 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Fri, 25 Sep 2020 21:19:31 -0700 Subject: mm: don't rely on system state to detect hot-plug operations In register_mem_sect_under_node() the system_state's value is checked to detect whether the call is made during boot time or during an hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at SYSTEM_SCHEDULING state. In addition, memory hot-plug operation can be triggered at this system state by the ACPI [1]. So checking against the system state is not enough. The consequence is that on system with interleaved node's ranges like this: Early memory node ranges node 1: [mem 0x0000000000000000-0x000000011fffffff] node 2: [mem 0x0000000120000000-0x000000014fffffff] node 1: [mem 0x0000000150000000-0x00000001ffffffff] node 0: [mem 0x0000000200000000-0x000000048fffffff] node 2: [mem 0x0000000490000000-0x00000007ffffffff] This can be seen on PowerPC LPAR after multiple memory hot-plug and hot-unplug operations are done. At the next reboot the node's memory ranges can be interleaved and since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node's id is not checked, and the sections registered to multiple nodes: $ ls -l /sys/devices/system/memory/memory21/node* total 0 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1 lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2 In that case, the system is able to boot but if later one of theses memory blocks is hot-unplugged and then hot-plugged, the sysfs inconsistency is detected and this is triggering a BUG_ON(): kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4 CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25 Call Trace: add_memory_resource+0x23c/0x340 (unreliable) __add_memory+0x5c/0xf0 dlpar_add_lmb+0x1b4/0x500 dlpar_memory+0x1f8/0xb80 handle_dlpar_errorlog+0xc0/0x190 dlpar_store+0x198/0x4a0 kobj_attr_store+0x30/0x50 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 vfs_write+0xe8/0x290 ksys_write+0xdc/0x130 system_call_exception+0x160/0x270 system_call_common+0xf0/0x27c This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() detailing whether the operation is due to a hot-plug operation. [1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state: $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \ -m size=$MEM,slots=255,maxmem=4294967296k \ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \ -object memory-backend-ram,id=memdimm6,size=134217728 -device pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \ Fixes: 4fbce633910e ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Fenghua Yu Cc: Nathan Lynch Cc: Scott Cheloha Cc: Tony Luck Cc: Link: https://lkml.kernel.org/r/20200915094143.79181-3-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds --- drivers/base/node.c | 85 +++++++++++++++++++++++++++++++++------------------- include/linux/node.h | 11 ++++--- mm/memory_hotplug.c | 3 +- 3 files changed, 64 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/base/node.c b/drivers/base/node.c index 508b80f6329b..50af16e68d98 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -761,14 +761,36 @@ static int __ref get_nid_for_pfn(unsigned long pfn) return pfn_to_nid(pfn); } +static int do_register_memory_block_under_node(int nid, + struct memory_block *mem_blk) +{ + int ret; + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. + */ + mem_blk->nid = nid; + + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, + &mem_blk->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + if (ret) + return ret; + + return sysfs_create_link_nowarn(&mem_blk->dev.kobj, + &node_devices[nid]->dev.kobj, + kobject_name(&node_devices[nid]->dev.kobj)); +} + /* register memory section under specified node if it spans that node */ -static int register_mem_sect_under_node(struct memory_block *mem_blk, - void *arg) +static int register_mem_block_under_node_early(struct memory_block *mem_blk, + void *arg) { unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); unsigned long end_pfn = start_pfn + memory_block_pfns - 1; - int ret, nid = *(int *)arg; + int nid = *(int *)arg; unsigned long pfn; for (pfn = start_pfn; pfn <= end_pfn; pfn++) { @@ -785,38 +807,33 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, } /* - * We need to check if page belongs to nid only for the boot - * case, during hotplug we know that all pages in the memory - * block belong to the same node. - */ - if (system_state == SYSTEM_BOOTING) { - page_nid = get_nid_for_pfn(pfn); - if (page_nid < 0) - continue; - if (page_nid != nid) - continue; - } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. + * We need to check if page belongs to nid only at the boot + * case because node's ranges can be interleaved. */ - mem_blk->nid = nid; - - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, - &mem_blk->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - if (ret) - return ret; + page_nid = get_nid_for_pfn(pfn); + if (page_nid < 0) + continue; + if (page_nid != nid) + continue; - return sysfs_create_link_nowarn(&mem_blk->dev.kobj, - &node_devices[nid]->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); + return do_register_memory_block_under_node(nid, mem_blk); } /* mem section does not span the specified node */ return 0; } +/* + * During hotplug we know that all pages in the memory block belong to the same + * node. + */ +static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, + void *arg) +{ + int nid = *(int *)arg; + + return do_register_memory_block_under_node(nid, mem_blk); +} + /* * Unregister a memory block device under the node it spans. Memory blocks * with multiple nodes cannot be offlined and therefore also never be removed. @@ -832,11 +849,19 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } -int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, + enum meminit_context context) { + walk_memory_blocks_func_t func; + + if (context == MEMINIT_HOTPLUG) + func = register_mem_block_under_node_hotplug; + else + func = register_mem_block_under_node_early; + return walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn), (void *)&nid, - register_mem_sect_under_node); + func); } #ifdef CONFIG_HUGETLBFS diff --git a/include/linux/node.h b/include/linux/node.h index 4866f32a02d8..014ba3ab2efd 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -99,11 +99,13 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn); +int link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long end_pfn, + enum meminit_context context) { return 0; } @@ -128,7 +130,8 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn); + error = link_mem_sections(nid, start_pfn, end_pfn, + MEMINIT_EARLY); } return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 345308a8bd15..ce3e73e3a5c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1080,7 +1080,8 @@ int __ref add_memory_resource(int nid, struct resource *res) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); BUG_ON(ret); /* create new memmap entry */ -- cgit v1.2.3 From 008cfe4418b3dbda2ff820cdd7b1a5ce458ae444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:57 -0400 Subject: mm: Introduce mm_struct.has_pinned (Commit message majorly collected from Jason Gunthorpe) Reduce the chance of false positive from page_maybe_dma_pinned() by keeping track if the mm_struct has ever been used with pin_user_pages(). This allows cases that might drive up the page ref_count to avoid any penalty from handling dma_pinned pages. Future work is planned, to provide a more sophisticated solution, likely to turn it into a real counter. For now, make it atomic_t but use it as a boolean for simplicity. Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 10 ++++++++++ kernel/fork.c | 1 + mm/gup.c | 6 ++++++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 496c3ff97cce..ed028af3cb19 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -436,6 +436,16 @@ struct mm_struct { */ atomic_t mm_count; + /** + * @has_pinned: Whether this mm has pinned any pages. This can + * be either replaced in the future by @pinned_vm when it + * becomes stable, or grow into a counter on its own. We're + * aggresive on this bit now - even if the pinned pages were + * unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm just for simplicity. + */ + atomic_t has_pinned; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..e65d8192d080 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); diff --git a/mm/gup.c b/mm/gup.c index 578bf5bd8bf8..dfe781d2ad4c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); -- cgit v1.2.3 From 7a4830c380f3a8b3425f6383deff58e65b2557b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:58 -0400 Subject: mm/fork: Pass new vma pointer into copy_page_range() This prepares for the future work to trigger early cow on pinned pages during fork(). No functional change intended. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- kernel/fork.c | 2 +- mm/memory.c | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 48614513eb66..16b799a0522c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1646,7 +1646,7 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *vma, struct vm_area_struct *new); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/kernel/fork.c b/kernel/fork.c index e65d8192d080..da8d360fb032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/memory.c b/mm/memory.c index f3eb55975902..d56178721452 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -819,6 +819,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; @@ -889,6 +890,7 @@ again: static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -915,7 +917,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; @@ -923,6 +925,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -949,7 +952,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; @@ -957,6 +960,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; @@ -971,14 +975,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) + struct vm_area_struct *vma, struct vm_area_struct *new) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -1033,7 +1037,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + vma, new, addr, next))) { ret = -ENOMEM; break; } -- cgit v1.2.3 From 62c59a8786e6bb75569cee91dab66e9da3ff4b68 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 25 Sep 2020 16:49:51 +0800 Subject: memstick: Skip allocating card when removing host After commit 6827ca573c03 ("memstick: rtsx_usb_ms: Support runtime power management"), removing module rtsx_usb_ms will be stuck. The deadlock is caused by powering on and powering off at the same time, the former one is when memstick_check() is flushed, and the later is called by memstick_remove_host(). Soe let's skip allocating card to prevent this issue. Fixes: 6827ca573c03 ("memstick: rtsx_usb_ms: Support runtime power management") Signed-off-by: Kai-Heng Feng Link: https://lore.kernel.org/r/20200925084952.13220-1-kai.heng.feng@canonical.com Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/memstick/core/memstick.c | 4 ++++ include/linux/memstick.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c index 693ee73eb291..ef03d6fafc5c 100644 --- a/drivers/memstick/core/memstick.c +++ b/drivers/memstick/core/memstick.c @@ -441,6 +441,9 @@ static void memstick_check(struct work_struct *work) } else if (host->card->stop) host->card->stop(host->card); + if (host->removing) + goto out_power_off; + card = memstick_alloc_card(host); if (!card) { @@ -545,6 +548,7 @@ EXPORT_SYMBOL(memstick_add_host); */ void memstick_remove_host(struct memstick_host *host) { + host->removing = 1; flush_workqueue(workqueue); mutex_lock(&host->lock); if (host->card) diff --git a/include/linux/memstick.h b/include/linux/memstick.h index da4c65f9435f..ebf73d4ee969 100644 --- a/include/linux/memstick.h +++ b/include/linux/memstick.h @@ -281,6 +281,7 @@ struct memstick_host { struct memstick_dev *card; unsigned int retries; + bool removing; /* Notify the host that some requests are pending. */ void (*request)(struct memstick_host *host); -- cgit v1.2.3 From a509a66a9d0d4f4e304d58fad38c078d0336c445 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 29 Sep 2020 15:25:22 +0200 Subject: arm64: permit ACPI core to map kernel memory used for table overrides Jonathan reports that the strict policy for memory mapped by the ACPI core breaks the use case of passing ACPI table overrides via initramfs. This is due to the fact that the memory type used for loading the initramfs in memory is not recognized as a memory type that is typically used by firmware to pass firmware tables. Since the purpose of the strict policy is to ensure that no AML or other ACPI code can manipulate any memory that is used by the kernel to keep its internal state or the state of user tasks, we can relax the permission check, and allow mappings of memory that is reserved and marked as NOMAP via memblock, and therefore not covered by the linear mapping to begin with. Fixes: 1583052d111f ("arm64/acpi: disallow AML memory opregions to access kernel memory") Fixes: 325f5585ec36 ("arm64/acpi: disallow writeable AML opregion mapping for EFI code regions") Reported-by: Jonathan Cameron Signed-off-by: Ard Biesheuvel Tested-by: Jonathan Cameron Cc: Sudeep Holla Cc: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20200929132522.18067-1-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi.c | 22 ++++++++++++++++++++-- include/linux/acpi.h | 2 +- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a85174d05473..cada0b816c8a 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -298,8 +298,21 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: - pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); - return NULL; + if (memblock_is_map_memory(phys) || + !memblock_is_region_memory(phys, size)) { + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + } + /* + * Mapping kernel memory is permitted if the region in + * question is covered by a single memblock with the + * NOMAP attribute set: this enables the use of ACPI + * table overrides passed via initramfs, which are + * reserved in memory using arch_reserve_mem_area() + * below. As this particular use case only requires + * read access, fall through to the R/O mapping case. + */ + fallthrough; case EFI_RUNTIME_SERVICES_CODE: /* @@ -388,3 +401,8 @@ int apei_claim_sea(struct pt_regs *regs) return err; } + +void arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + memblock_mark_nomap(addr, size); +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1e4cdc6c7ae2..64ae25c59d55 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -958,7 +958,7 @@ void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); -#ifdef CONFIG_X86 +#ifndef CONFIG_IA64 void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else static inline void arch_reserve_mem_area(acpi_physical_address addr, -- cgit v1.2.3 From 472e5b056f000a778abb41f1e443de58eb259783 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Oct 2020 19:14:36 -0700 Subject: pipe: remove pipe_wait() and fix wakeup race with splice The pipe splice code still used the old model of waiting for pipe IO by using a non-specific "pipe_wait()" that waited for any pipe event to happen, which depended on all pipe IO being entirely serialized by the pipe lock. So by checking the state you were waiting for, and then adding yourself to the wait queue before dropping the lock, you were guaranteed to see all the wakeups. Strictly speaking, the actual wakeups were not done under the lock, but the pipe_wait() model still worked, because since the waiter held the lock when checking whether it should sleep, it would always see the current state, and the wakeup was always done after updating the state. However, commit 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing") split the single wait-queue into two, and in the process also made the "wait for event" code wait for _two_ wait queues, and that then showed a race with the wakers that were not serialized by the pipe lock. It's only splice that used that "pipe_wait()" model, so the problem wasn't obvious, but Josef Bacik reports: "I hit a hang with fstest btrfs/187, which does a btrfs send into /dev/null. This works by creating a pipe, the write side is given to the kernel to write into, and the read side is handed to a thread that splices into a file, in this case /dev/null. The box that was hung had the write side stuck here [pipe_write] and the read side stuck here [splice_from_pipe_next -> pipe_wait]. [ more details about pipe_wait() scenario ] The problem is we're doing the prepare_to_wait, which sets our state each time, however we can be woken up either with reads or writes. In the case above we race with the WRITER waking us up, and re-set our state to INTERRUPTIBLE, and thus never break out of schedule" Josef had a patch that avoided the issue in pipe_wait() by just making it set the state only once, but the deeper problem is that pipe_wait() depends on a level of synchonization by the pipe mutex that it really shouldn't. And the whole "wait for any pipe state change" model really isn't very good to begin with. So rather than trying to work around things in pipe_wait(), remove that legacy model of "wait for arbitrary pipe event" entirely, and actually create functions that wait for the pipe actually being readable or writable, and can do so without depending on the pipe lock serializing everything. Fixes: 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing") Link: https://lore.kernel.org/linux-fsdevel/bfa88b5ad6f069b2b679316b9e495a970130416c.1601567868.git.josef@toxicpanda.com/ Reported-by: Josef Bacik Reviewed-and-tested-by: Josef Bacik Signed-off-by: Linus Torvalds --- fs/pipe.c | 62 +++++++++++++++++++++++++++++++---------------- fs/splice.c | 8 +++--- include/linux/pipe_fs_i.h | 5 ++-- 3 files changed, 48 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/fs/pipe.c b/fs/pipe.c index 60dbee457143..117db82b10af 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, } } -/* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct pipe_inode_info *pipe) -{ - DEFINE_WAIT(rdwait); - DEFINE_WAIT(wrwait); - - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); - prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); - pipe_unlock(pipe); - schedule(); - finish_wait(&pipe->rd_wait, &rdwait); - finish_wait(&pipe->wr_wait, &wrwait); - pipe_lock(pipe); -} - static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -1035,12 +1016,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) return do_pipe2(fildes, 0); } +/* + * This is the stupid "wait for pipe to be readable or writable" + * model. + * + * See pipe_read/write() for the proper kind of exclusive wait, + * but that requires that we wake up any other readers/writers + * if we then do not end up reading everything (ie the whole + * "wake_next_reader/writer" logic in pipe_read/write()). + */ +void pipe_wait_readable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); + pipe_lock(pipe); +} + +void pipe_wait_writable(struct pipe_inode_info *pipe) +{ + pipe_unlock(pipe); + wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); + pipe_lock(pipe); +} + +/* + * This depends on both the wait (here) and the wakeup (wake_up_partner) + * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot + * race with the count check and waitqueue prep. + * + * Normally in order to avoid races, you'd do the prepare_to_wait() first, + * then check the condition you're waiting for, and only then sleep. But + * because of the pipe lock, we can check the condition before being on + * the wait queue. + * + * We use the 'rd_wait' waitqueue for pipe partner waiting. + */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { + DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { - pipe_wait(pipe); + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); + pipe_unlock(pipe); + schedule(); + finish_wait(&pipe->rd_wait, &rdwait); + pipe_lock(pipe); if (signal_pending(current)) break; } @@ -1050,7 +1071,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); - wake_up_interruptible_all(&pipe->wr_wait); } static int fifo_open(struct inode *inode, struct file *filp) diff --git a/fs/splice.c b/fs/splice.c index d7c8a7c4db07..c3d00dfc7344 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -563,7 +563,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des sd->need_wakeup = false; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } return 1; @@ -1077,7 +1077,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe_wait(pipe); + pipe_wait_writable(pipe); } } @@ -1454,7 +1454,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -EAGAIN; break; } - pipe_wait(pipe); + pipe_wait_readable(pipe); } pipe_unlock(pipe); @@ -1493,7 +1493,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe_wait(pipe); + pipe_wait_writable(pipe); } pipe_unlock(pipe); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 50afd0d0084c..5d2705f1d01c 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -240,8 +240,9 @@ extern unsigned int pipe_max_size; extern unsigned long pipe_user_pages_hard; extern unsigned long pipe_user_pages_soft; -/* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct pipe_inode_info *pipe); +/* Wait for a pipe to be readable/writable while dropping the pipe lock */ +void pipe_wait_readable(struct pipe_inode_info *); +void pipe_wait_writable(struct pipe_inode_info *); struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); -- cgit v1.2.3 From be458311cdbb5d94820ffc4e40c5906085c0a507 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 1 Oct 2020 13:07:49 -0700 Subject: mm: memcg/slab: fix slab statistics in !SMP configuration Since commit ea426c2a7de8 ("mm: memcg: prepare for byte-sized vmstat items") the write side of slab counters accepts a value in bytes and converts it to pages. It happens in __mod_node_page_state(). However a non-SMP version of __mod_node_page_state() doesn't perform this conversion. It leads to incorrect (unrealistically high) slab counters values. Fix this by adding a similar conversion to the non-SMP version of __mod_node_page_state(). Signed-off-by: Roman Gushchin Reported-and-tested-by: Bastian Bittorf Fixes: ea426c2a7de8 ("mm: memcg: prepare for byte-sized vmstat items") Acked-by: Vlastimil Babka Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 91220ace31da..7557c1070fd7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -312,6 +312,11 @@ static inline void __mod_zone_page_state(struct zone *zone, static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { + if (vmstat_item_in_bytes(item)) { + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + node_page_state_add(delta, pgdat, item); } -- cgit v1.2.3