From ad1e03b2b3d4430baaa109b77bc308dc73050de3 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 10 Feb 2020 17:10:46 +0100 Subject: core: Don't skip generic XDP program execution for cloned SKBs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current generic XDP handler skips execution of XDP programs entirely if an SKB is marked as cloned. This leads to some surprising behaviour, as packets can end up being cloned in various ways, which will make an XDP program not see all the traffic on an interface. This was discovered by a simple test case where an XDP program that always returns XDP_DROP is installed on a veth device. When combining this with the Scapy packet sniffer (which uses an AF_PACKET) socket on the sending side, SKBs reliably end up in the cloned state, causing them to be passed through to the receiving interface instead of being dropped. A minimal reproducer script for this is included below. This patch fixed the issue by simply triggering the existing linearisation code for cloned SKBs instead of skipping the XDP program execution. This behaviour is in line with the behaviour of the native XDP implementation for the veth driver, which will reallocate and copy the SKB data if the SKB is marked as shared. Reproducer Python script (requires BCC and Scapy): from scapy.all import TCP, IP, Ether, sendp, sniff, AsyncSniffer, Raw, UDP from bcc import BPF import time, sys, subprocess, shlex SKB_MODE = (1 << 1) DRV_MODE = (1 << 2) PYTHON=sys.executable def client(): time.sleep(2) # Sniffing on the sender causes skb_cloned() to be set s = AsyncSniffer() s.start() for p in range(10): sendp(Ether(dst="aa:aa:aa:aa:aa:aa", src="cc:cc:cc:cc:cc:cc")/IP()/UDP()/Raw("Test"), verbose=False) time.sleep(0.1) s.stop() return 0 def server(mode): prog = BPF(text="int dummy_drop(struct xdp_md *ctx) {return XDP_DROP;}") func = prog.load_func("dummy_drop", BPF.XDP) prog.attach_xdp("a_to_b", func, mode) time.sleep(1) s = sniff(iface="a_to_b", count=10, timeout=15) if len(s): print(f"Got {len(s)} packets - should have gotten 0") return 1 else: print("Got no packets - as expected") return 0 if len(sys.argv) < 2: print(f"Usage: {sys.argv[0]} ") sys.exit(1) if sys.argv[1] == "client": sys.exit(client()) elif sys.argv[1] == "server": mode = SKB_MODE if sys.argv[2] == 'skb' else DRV_MODE sys.exit(server(mode)) else: try: mode = sys.argv[1] if mode not in ('skb', 'drv'): print(f"Usage: {sys.argv[0]} ") sys.exit(1) print(f"Running in {mode} mode") for cmd in [ 'ip netns add netns_a', 'ip netns add netns_b', 'ip -n netns_a link add a_to_b type veth peer name b_to_a netns netns_b', # Disable ipv6 to make sure there's no address autoconf traffic 'ip netns exec netns_a sysctl -qw net.ipv6.conf.a_to_b.disable_ipv6=1', 'ip netns exec netns_b sysctl -qw net.ipv6.conf.b_to_a.disable_ipv6=1', 'ip -n netns_a link set dev a_to_b address aa:aa:aa:aa:aa:aa', 'ip -n netns_b link set dev b_to_a address cc:cc:cc:cc:cc:cc', 'ip -n netns_a link set dev a_to_b up', 'ip -n netns_b link set dev b_to_a up']: subprocess.check_call(shlex.split(cmd)) server = subprocess.Popen(shlex.split(f"ip netns exec netns_a {PYTHON} {sys.argv[0]} server {mode}")) client = subprocess.Popen(shlex.split(f"ip netns exec netns_b {PYTHON} {sys.argv[0]} client")) client.wait() server.wait() sys.exit(server.returncode) finally: subprocess.run(shlex.split("ip netns delete netns_a")) subprocess.run(shlex.split("ip netns delete netns_b")) Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Reported-by: Stepan Horacek Suggested-by: Paolo Abeni Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a69e8bd7ed74..a6316b336128 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4527,14 +4527,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb) || skb_is_tc_redirected(skb)) + if (skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ - if (skb_is_nonlinear(skb) || + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; -- cgit v1.2.3 From e08ad80551b4b33c02f2fce1522f6c227d3976cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Feb 2020 07:53:53 -0800 Subject: net: add strict checks in netdev_name_node_alt_destroy() netdev_name_node_alt_destroy() does a lookup over all device names of a namespace. We need to make sure the name belongs to the device of interest, and that we do not destroy its primary name, since we rely on it being not deleted : dev->name_node would indeed point to freed memory. syzbot report was the following : BUG: KASAN: use-after-free in dev_net include/linux/netdevice.h:2206 [inline] BUG: KASAN: use-after-free in mld_force_mld_version net/ipv6/mcast.c:1172 [inline] BUG: KASAN: use-after-free in mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] BUG: KASAN: use-after-free in mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 Read of size 8 at addr ffff88809886c588 by task swapper/1/0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:641 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135 dev_net include/linux/netdevice.h:2206 [inline] mld_force_mld_version net/ipv6/mcast.c:1172 [inline] mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 mld_send_initial_cr net/ipv6/mcast.c:2083 [inline] mld_dad_timer_expire+0x24/0x230 net/ipv6/mcast.c:2118 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786 __do_softirq+0x262/0x98c kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x19b/0x1e0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:546 [inline] smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1146 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: 68 73 c5 f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 94 be 59 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 84 be 59 00 fb f4 cc 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 de 2a 74 f9 e8 09 RSP: 0018:ffffc90000d3fd68 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff136761a RBX: ffff8880a99fc340 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffff8880a99fcbd4 RBP: ffffc90000d3fd98 R08: ffff8880a99fc340 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff8aa5a1c0 R14: 0000000000000000 R15: 0000000000000001 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:686 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361 start_secondary+0x2f4/0x410 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 Allocated by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:515 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:488 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:529 __do_kmalloc_node mm/slab.c:3616 [inline] __kmalloc_node+0x4e/0x70 mm/slab.c:3623 kmalloc_node include/linux/slab.h:578 [inline] kvmalloc_node+0x68/0x100 mm/util.c:574 kvmalloc include/linux/mm.h:645 [inline] kvzalloc include/linux/mm.h:653 [inline] alloc_netdev_mqs+0x98/0xe40 net/core/dev.c:9797 rtnl_create_link+0x22d/0xaf0 net/core/rtnetlink.c:3047 __rtnl_newlink+0xf9f/0x1790 net/core/rtnetlink.c:3309 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3377 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 __sys_sendto+0x262/0x380 net/socket.c:1998 __do_compat_sys_socketcall net/compat.c:771 [inline] __se_compat_sys_socketcall net/compat.c:719 [inline] __ia32_compat_sys_socketcall+0x530/0x710 net/compat.c:719 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 Freed by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:337 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:476 kasan_slab_free+0xe/0x10 mm/kasan/common.c:485 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x2c0 mm/slab.c:3757 __netdev_name_node_alt_destroy+0x1ff/0x2a0 net/core/dev.c:322 netdev_name_node_alt_destroy+0x57/0x80 net/core/dev.c:334 rtnl_alt_ifname net/core/rtnetlink.c:3518 [inline] rtnl_linkprop.isra.0+0x575/0x6f0 net/core/rtnetlink.c:3567 rtnl_dellinkprop+0x46/0x60 net/core/rtnetlink.c:3588 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __compat_sys_sendmsg net/compat.c:642 [inline] __do_compat_sys_sendmsg net/compat.c:649 [inline] __se_compat_sys_sendmsg net/compat.c:646 [inline] __ia32_compat_sys_sendmsg+0x7a/0xb0 net/compat.c:646 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 The buggy address belongs to the object at ffff88809886c000 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1416 bytes inside of 4096-byte region [ffff88809886c000, ffff88809886d000) The buggy address belongs to the page: page:ffffea0002621b00 refcount:1 mapcount:0 mapping:ffff8880aa402000 index:0x0 compound_mapcount: 0 flags: 0xfffe0000010200(slab|head) raw: 00fffe0000010200 ffffea0002610d08 ffffea0002607608 ffff8880aa402000 raw: 0000000000000000 ffff88809886c000 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88809886c480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88809886c580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88809886c600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 36fbf1e52bd3 ("net: rtnetlink: add linkprop commands to add and delete alternative ifnames") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jiri Pirko Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a6316b336128..b6d13f3f1e5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -331,6 +331,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + __netdev_name_node_alt_destroy(name_node); return 0; -- cgit v1.2.3 From 7151affeef8d527f50b4b68a871fd28bd660023f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:21 +0000 Subject: net: export netdev_next_lower_dev_rcu() netdev_next_lower_dev_rcu() will be used to implement a function, which is to walk all lower interfaces. There are already functions that they walk their lower interface. (netdev_walk_all_lower_dev_rcu, netdev_walk_all_lower_dev()). But, there would be cases that couldn't be covered by given netdev_walk_all_lower_dev_{rcu}() function. So, some modules would want to implement own function, which is to walk all lower interfaces. In the next patch, netdev_next_lower_dev_rcu() will be used. In addition, this patch removes two unused prototypes in netdevice.h. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++---- net/core/dev.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f1f633235f6..6c3f7032e8d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -72,6 +72,8 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ +#define MAX_NEST_DEV 8 + /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -4389,11 +4391,8 @@ void *netdev_lower_get_next(struct net_device *dev, ldev; \ ldev = netdev_lower_get_next(dev, &(iter))) -struct net_device *netdev_all_lower_get_next(struct net_device *dev, +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter); - int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, void *data), diff --git a/net/core/dev.c b/net/core/dev.c index b6d13f3f1e5a..2577ebfed293 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,7 +146,6 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 -#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -7207,8 +7206,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, return 0; } -static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; @@ -7220,6 +7219,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { -- cgit v1.2.3 From 379349e9bc3b42b8b2f8f7a03f64a97623fff323 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2020 18:15:44 +0100 Subject: Revert "net: dev: introduce support for sch BYPASS for lockless qdisc" This reverts commit ba27b4cdaaa66561aaedb2101876e563738d36fe Ahmed reported ouf-of-order issues bisected to commit ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc"). I can't find any working solution other than a plain revert. This will introduce some minor performance regressions for pfifo_fast qdisc. I plan to address them in net-next with more indirect call wrapper boilerplate for qdiscs. Reported-by: Ahmad Fatoum Fixes: ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2577ebfed293..e10bd680dc03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3662,26 +3662,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && - qdisc_run_begin(q)) { - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, - &q->state))) { - __qdisc_drop(skb, &to_free); - rc = NET_XMIT_DROP; - goto end_run; - } - qdisc_bstats_cpu_update(q, skb); - - rc = NET_XMIT_SUCCESS; - if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) - __qdisc_run(q); - -end_run: - qdisc_run_end(q); - } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - qdisc_run(q); - } + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + qdisc_run(q); if (unlikely(to_free)) kfree_skb_list(to_free); -- cgit v1.2.3 From 6e11d1578fba8d09d03a286740ffcf336d53928c Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Mon, 24 Feb 2020 10:56:00 -0800 Subject: net: Fix Tx hash bound checking Fixes the lower and upper bounds when there are multiple TCs and traffic is on the the same TC on the same device. The lower bound is represented by 'qoffset' and the upper limit for hash value is 'qcount + qoffset'. This gives a clean Rx to Tx queue mapping when there are multiple TCs, as the queue indices for upper TCs will be offset by 'qoffset'. v2: Fixed commit description based on comments. Fixes: 1b837d489e06 ("net: Revoke export for __skb_tx_hash, update it to just be static skb_tx_hash") Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx") Signed-off-by: Amritha Nambiar Reviewed-by: Alexander Duyck Reviewed-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e10bd680dc03..c6c985fe7b1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3076,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev, if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; -- cgit v1.2.3