From 6a2febec338df7e7699a52d00b2e1207dcf65b28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 30 Jun 2020 16:41:01 -0700 Subject: tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key() MD5 keys are read with RCU protection, and tcp_md5_do_add() might update in-place a prior key. Normally, typical RCU updates would allocate a new piece of memory. In this case only key->key and key->keylen might be updated, and we do not care if an incoming packet could see the old key, the new one, or some intermediate value, since changing the key on a live flow is known to be problematic anyway. We only want to make sure that in the case key->keylen is changed, cpus in tcp_md5_hash_key() wont try to use uninitialized data, or crash because key->keylen was read twice to feed sg_init_one() and ahash_request_set_crypt() Fixes: 9ea88a153001 ("tcp: md5: check md5 signature without socket lock") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 810cc164f795..f11166045324 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4033,10 +4033,13 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { + u8 keylen = key->keylen; struct scatterlist sg; - sg_init_one(&sg, key->key, key->keylen); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); + smp_rmb(); /* paired with smp_wmb() in tcp_md5_do_add() */ + + sg_init_one(&sg, key->key, keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_key); -- cgit v1.2.3 From e6ced831ef11a2a06e8d00aad9d4fc05b610bf38 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Wed, 1 Jul 2020 11:43:04 -0700 Subject: tcp: md5: refine tcp_md5_do_add()/tcp_md5_hash_key() barriers My prior fix went a bit too far, according to Herbert and Mathieu. Since we accept that concurrent TCP MD5 lookups might see inconsistent keys, we can use READ_ONCE()/WRITE_ONCE() instead of smp_rmb()/smp_wmb() Clearing all key->key[] is needed to avoid possible KMSAN reports, if key->keylen is increased. Since tcp_md5_do_add() is not fast path, using __GFP_ZERO to clear all struct tcp_md5sig_key is simpler. data_race() was added in linux-5.8 and will prevent KCSAN reports, this can safely be removed in stable backports, if data_race() is not yet backported. v2: use data_race() both in tcp_md5_hash_key() and tcp_md5_do_add() Fixes: 6a2febec338d ("tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key()") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Marco Elver <elver@google.com> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f11166045324..c33f7c6aff8e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4033,14 +4033,14 @@ EXPORT_SYMBOL(tcp_md5_hash_skb_data); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) { - u8 keylen = key->keylen; + u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; - smp_rmb(); /* paired with smp_wmb() in tcp_md5_do_add() */ - sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->md5_req, &sg, NULL, keylen); - return crypto_ahash_update(hp->md5_req); + + /* We use data_race() because tcp_md5_do_add() might change key->key under us */ + return data_race(crypto_ahash_update(hp->md5_req)); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 99916fcc15ca..04bfcbbfee83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1111,12 +1111,21 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { - /* Pre-existing entry - just update that one. */ - memcpy(key->key, newkey, newkeylen); + /* Pre-existing entry - just update that one. + * Note that the key might be used concurrently. + * data_race() is telling kcsan that we do not care of + * key mismatches, since changing MD5 key on live flows + * can lead to packet drops. + */ + data_race(memcpy(key->key, newkey, newkeylen)); - smp_wmb(); /* pairs with smp_rmb() in tcp_md5_hash_key() */ + /* Pairs with READ_ONCE() in tcp_md5_hash_key(). + * Also note that a reader could catch new key->keylen value + * but old key->key[], this is the reason we use __GFP_ZERO + * at sock_kmalloc() time below these lines. + */ + WRITE_ONCE(key->keylen, newkeylen); - key->keylen = newkeylen; return 0; } @@ -1132,7 +1141,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, rcu_assign_pointer(tp->md5sig_info, md5sig); } - key = sock_kmalloc(sk, sizeof(*key), gfp); + key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) return -ENOMEM; if (!tcp_alloc_md5sig_pool()) { -- cgit v1.2.3 From 1ca0fafd73c5268e8fc4b997094b8bb2bfe8deea Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Wed, 1 Jul 2020 18:39:33 -0700 Subject: tcp: md5: allow changing MD5 keys in all socket states This essentially reverts commit 721230326891 ("tcp: md5: reject TCP_MD5SIG or TCP_MD5SIG_EXT on established sockets") Mathieu reported that many vendors BGP implementations can actually switch TCP MD5 on established flows. Quoting Mathieu : Here is a list of a few network vendors along with their behavior with respect to TCP MD5: - Cisco: Allows for password to be changed, but within the hold-down timer (~180 seconds). - Juniper: When password is initially set on active connection it will reset, but after that any subsequent password changes no network resets. - Nokia: No notes on if they flap the tcp connection or not. - Ericsson/RedBack: Allows for 2 password (old/new) to co-exist until both sides are ok with new passwords. - Meta-Switch: Expects the password to be set before a connection is attempted, but no further info on whether they reset the TCP connection on a change. - Avaya: Disable the neighbor, then set password, then re-enable. - Zebos: Would normally allow the change when socket connected. We can revert my prior change because commit 9424e2e7ad93 ("tcp: md5: fix potential overestimation of TCP option space") removed the leak of 4 kernel bytes to the wire that was the main reason for my patch. While doing my investigations, I found a bug when a MD5 key is changed, leading to these commits that stable teams want to consider before backporting this revert : Commit 6a2febec338d ("tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key()") Commit e6ced831ef11 ("tcp: md5: refine tcp_md5_do_add()/tcp_md5_hash_key() barriers") Fixes: 721230326891 "tcp: md5: reject TCP_MD5SIG or TCP_MD5SIG_EXT on established sockets" Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c33f7c6aff8e..861fbd84c9cf 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3246,10 +3246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: - if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) - err = tp->af_specific->md5_parse(sk, optname, optval, optlen); - else - err = -EINVAL; + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: -- cgit v1.2.3 From ce69e563b325f620863830c246a8698ccea52048 Mon Sep 17 00:00:00 2001 From: Christoph Paasch <cpaasch@apple.com> Date: Wed, 8 Jul 2020 16:18:34 -0700 Subject: tcp: make sure listeners don't initialize congestion-control state syzkaller found its way into setsockopt with TCP_CONGESTION "cdg". tcp_cdg_init() does a kcalloc to store the gradients. As sk_clone_lock just copies all the memory, the allocated pointer will be copied as well, if the app called setsockopt(..., TCP_CONGESTION) on the listener. If now the socket will be destroyed before the congestion-control has properly been initialized (through a call to tcp_init_transfer), we will end up freeing memory that does not belong to that particular socket, opening the door to a double-free: [ 11.413102] ================================================================== [ 11.414181] BUG: KASAN: double-free or invalid-free in tcp_cleanup_congestion_control+0x58/0xd0 [ 11.415329] [ 11.415560] CPU: 3 PID: 4884 Comm: syz-executor.5 Not tainted 5.8.0-rc2 #80 [ 11.416544] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 11.418148] Call Trace: [ 11.418534] <IRQ> [ 11.418834] dump_stack+0x7d/0xb0 [ 11.419297] print_address_description.constprop.0+0x1a/0x210 [ 11.422079] kasan_report_invalid_free+0x51/0x80 [ 11.423433] __kasan_slab_free+0x15e/0x170 [ 11.424761] kfree+0x8c/0x230 [ 11.425157] tcp_cleanup_congestion_control+0x58/0xd0 [ 11.425872] tcp_v4_destroy_sock+0x57/0x5a0 [ 11.426493] inet_csk_destroy_sock+0x153/0x2c0 [ 11.427093] tcp_v4_syn_recv_sock+0xb29/0x1100 [ 11.427731] tcp_get_cookie_sock+0xc3/0x4a0 [ 11.429457] cookie_v4_check+0x13d0/0x2500 [ 11.433189] tcp_v4_do_rcv+0x60e/0x780 [ 11.433727] tcp_v4_rcv+0x2869/0x2e10 [ 11.437143] ip_protocol_deliver_rcu+0x23/0x190 [ 11.437810] ip_local_deliver+0x294/0x350 [ 11.439566] __netif_receive_skb_one_core+0x15d/0x1a0 [ 11.441995] process_backlog+0x1b1/0x6b0 [ 11.443148] net_rx_action+0x37e/0xc40 [ 11.445361] __do_softirq+0x18c/0x61a [ 11.445881] asm_call_on_stack+0x12/0x20 [ 11.446409] </IRQ> [ 11.446716] do_softirq_own_stack+0x34/0x40 [ 11.447259] do_softirq.part.0+0x26/0x30 [ 11.447827] __local_bh_enable_ip+0x46/0x50 [ 11.448406] ip_finish_output2+0x60f/0x1bc0 [ 11.450109] __ip_queue_xmit+0x71c/0x1b60 [ 11.451861] __tcp_transmit_skb+0x1727/0x3bb0 [ 11.453789] tcp_rcv_state_process+0x3070/0x4d3a [ 11.456810] tcp_v4_do_rcv+0x2ad/0x780 [ 11.457995] __release_sock+0x14b/0x2c0 [ 11.458529] release_sock+0x4a/0x170 [ 11.459005] __inet_stream_connect+0x467/0xc80 [ 11.461435] inet_stream_connect+0x4e/0xa0 [ 11.462043] __sys_connect+0x204/0x270 [ 11.465515] __x64_sys_connect+0x6a/0xb0 [ 11.466088] do_syscall_64+0x3e/0x70 [ 11.466617] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 11.467341] RIP: 0033:0x7f56046dc469 [ 11.467844] Code: Bad RIP value. [ 11.468282] RSP: 002b:00007f5604dccdd8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a [ 11.469326] RAX: ffffffffffffffda RBX: 000000000068bf00 RCX: 00007f56046dc469 [ 11.470379] RDX: 0000000000000010 RSI: 0000000020000000 RDI: 0000000000000004 [ 11.471311] RBP: 00000000ffffffff R08: 0000000000000000 R09: 0000000000000000 [ 11.472286] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 11.473341] R13: 000000000041427c R14: 00007f5604dcd5c0 R15: 0000000000000003 [ 11.474321] [ 11.474527] Allocated by task 4884: [ 11.475031] save_stack+0x1b/0x40 [ 11.475548] __kasan_kmalloc.constprop.0+0xc2/0xd0 [ 11.476182] tcp_cdg_init+0xf0/0x150 [ 11.476744] tcp_init_congestion_control+0x9b/0x3a0 [ 11.477435] tcp_set_congestion_control+0x270/0x32f [ 11.478088] do_tcp_setsockopt.isra.0+0x521/0x1a00 [ 11.478744] __sys_setsockopt+0xff/0x1e0 [ 11.479259] __x64_sys_setsockopt+0xb5/0x150 [ 11.479895] do_syscall_64+0x3e/0x70 [ 11.480395] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 11.481097] [ 11.481321] Freed by task 4872: [ 11.481783] save_stack+0x1b/0x40 [ 11.482230] __kasan_slab_free+0x12c/0x170 [ 11.482839] kfree+0x8c/0x230 [ 11.483240] tcp_cleanup_congestion_control+0x58/0xd0 [ 11.483948] tcp_v4_destroy_sock+0x57/0x5a0 [ 11.484502] inet_csk_destroy_sock+0x153/0x2c0 [ 11.485144] tcp_close+0x932/0xfe0 [ 11.485642] inet_release+0xc1/0x1c0 [ 11.486131] __sock_release+0xc0/0x270 [ 11.486697] sock_close+0xc/0x10 [ 11.487145] __fput+0x277/0x780 [ 11.487632] task_work_run+0xeb/0x180 [ 11.488118] __prepare_exit_to_usermode+0x15a/0x160 [ 11.488834] do_syscall_64+0x4a/0x70 [ 11.489326] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Wei Wang fixed a part of these CDG-malloc issues with commit c12014440750 ("tcp: memset ca_priv data to 0 properly"). This patch here fixes the listener-scenario: We make sure that listeners setting the congestion-control through setsockopt won't initialize it (thus CDG never allocates on listeners). For those who use AF_UNSPEC to reuse a socket, tcp_disconnect() is changed to cleanup afterwards. (The issue can be reproduced at least down to v4.4.x.) Cc: Wei Wang <weiwan@google.com> Cc: Eric Dumazet <edumazet@google.com> Fixes: 2b0a8c9eee81 ("tcp: add CDG congestion control") Signed-off-by: Christoph Paasch <cpaasch@apple.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_cong.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 861fbd84c9cf..6f0caf9a866d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2691,6 +2691,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3172e31987be..62878cf26d9c 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -197,7 +197,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } -- cgit v1.2.3