From dd29c67dbbbf97c8aa786a502977856d8eb9a73d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 09:04:35 -0700 Subject: af_unix: use DEBUG_NET_WARN_ON_ONCE() Replace four WARN_ON() that have not triggered recently with DEBUG_NET_WARN_ON_ONCE(). Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2206e6f8902d..3453e0053f76 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -302,7 +302,7 @@ static void __unix_remove_socket(struct sock *sk) static void __unix_insert_socket(struct sock *sk) { - WARN_ON(!sk_unhashed(sk)); + DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); sk_add_node(sk, &unix_socket_table[sk->sk_hash]); } @@ -554,9 +554,9 @@ static void unix_sock_destructor(struct sock *sk) u->oob_skb = NULL; } #endif - WARN_ON(refcount_read(&sk->sk_wmem_alloc)); - WARN_ON(!sk_unhashed(sk)); - WARN_ON(sk->sk_socket); + DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); + DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); + DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; -- cgit v1.2.3 From 965b57b469a589d64d81b1688b38dcb537011bb0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:12 -0700 Subject: net: Introduce a new proto_ops ->read_skb() Currently both splice() and sockmap use ->read_sock() to read skb from receive queue, but for sockmap we only read one entire skb at a time, so ->read_sock() is too conservative to use. Introduce a new proto_ops ->read_skb() which supports this sematic, with this we can finally pass the ownership of skb to recv actors. For non-TCP protocols, all ->read_sock() can be simply converted to ->read_skb(). Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com --- include/linux/net.h | 4 ++++ include/net/tcp.h | 3 +-- include/net/udp.h | 3 +-- net/core/skmsg.c | 20 +++++--------------- net/ipv4/af_inet.c | 3 ++- net/ipv4/tcp.c | 9 +++------ net/ipv4/udp.c | 10 ++++------ net/ipv6/af_inet6.c | 3 ++- net/unix/af_unix.c | 23 +++++++++-------------- 9 files changed, 31 insertions(+), 47 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/linux/net.h b/include/linux/net.h index 12093f4db50c..a03485e8cbb2 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -152,6 +152,8 @@ struct module; struct sk_buff; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); +typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); + struct proto_ops { int family; @@ -214,6 +216,8 @@ struct proto_ops { */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); + /* This is different from read_sock(), it reads an entire skb at a time. */ + int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendpage_locked)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, diff --git a/include/net/tcp.h b/include/net/tcp.h index 7547d90fbb57..8e48dc56837b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,8 +672,7 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index b60eea2e3fae..987f7fc7c0aa 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -306,8 +306,7 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 7e03f96e441b..f7f63b7d990c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1160,21 +1160,17 @@ static void sk_psock_done_strp(struct sk_psock *psock) } #endif /* CONFIG_BPF_STREAM_PARSER */ -static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, - unsigned int offset, size_t orig_len) +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *)desc->arg.data; struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = orig_len; + int len = skb->len; /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - desc->error = -ENOMEM; + if (!skb) return 0; - } rcu_read_lock(); psock = sk_psock(sk); @@ -1204,16 +1200,10 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; - read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; - - desc.arg.data = sk; - desc.error = 0; - desc.count = 1; - - sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); + sock->ops->read_skb(sk, sk_psock_verdict_recv); } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index da81f56fdd1c..7abd652a558f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, @@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 124f384f8695..9d2fd3ced21b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,8 +1734,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; @@ -1750,7 +1749,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, int used; __skb_unlink(skb, &sk->sk_receive_queue); - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1765,9 +1764,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, break; } consume_skb(skb); - if (!desc->count) - break; - WRITE_ONCE(tp->copied_seq, seq); + break; } WRITE_ONCE(tp->copied_seq, seq); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6172b4750a88..c660b0bc4d14 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1797,8 +1797,7 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -1820,7 +1819,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1831,13 +1830,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; } -EXPORT_SYMBOL(udp_read_sock); +EXPORT_SYMBOL(udp_read_skb); /* * This should be easy, if there is something there we diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 658823e91eca..0ee0770e79aa 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -702,6 +702,7 @@ const struct proto_ops inet6_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, @@ -727,7 +728,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..1bed3739768c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, - .read_sock = unix_stream_read_sock, + .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, - .read_sock = unix_read_sock, + .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -2487,8 +2485,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si return __unix_dgram_recvmsg(sk, msg, size, flags); } -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -2503,7 +2500,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, if (!skb) return err; - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -2514,8 +2511,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; @@ -2650,13 +2646,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { if (unlikely(sk->sk_state != TCP_ESTABLISHED)) return -ENOTCONN; - return unix_read_sock(sk, desc, recv_actor); + return unix_read_skb(sk, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, -- cgit v1.2.3 From 340c3d337119ea177a98338be2e3bc62ee87ac80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:08 -0700 Subject: af_unix: Clean up some sock_net() uses. Some functions define a net pointer only for one-shot use. Others call sock_net() redundantly even when a net pointer is available. Let's fix these and make the code simpler. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 33 ++++++++++++++------------------- net/unix/diag.c | 3 +-- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..990257f02e7c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -932,7 +932,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_unbound_socket(sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; @@ -1293,9 +1293,8 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + struct sock *sk = sock->sk; struct sock *other; int err; @@ -1316,7 +1315,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1404,15 +1403,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; - struct sock *newsk = NULL; - struct sock *other = NULL; + struct net *net = sock_net(sk); struct sk_buff *skb = NULL; - int st; - int err; long timeo; + int err; + int st; err = unix_validate_addr(sunaddr, addr_len); if (err) @@ -1432,7 +1429,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, */ /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); + newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; @@ -1840,17 +1837,15 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - struct sock *other = NULL; - int err; - struct sk_buff *skb; - long timeo; + struct sock *sk = sock->sk, *other = NULL; + struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; + struct sk_buff *skb; int data_len = 0; int sk_locked; + long timeo; + int err; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1917,7 +1912,7 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, msg->msg_namelen, + other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); diff --git a/net/unix/diag.c b/net/unix/diag.c index bb0b5ea1655f..4e3dc8179fa4 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -308,7 +308,6 @@ out_nosk: static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct unix_diag_req); - struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -317,7 +316,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = unix_diag_dump, }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); + return netlink_dump_start(sock_net(skb->sk)->diag_nlsk, skb, h, &c); } else return unix_diag_get_exact(skb, h, nlmsg_data(h)); } -- cgit v1.2.3 From f302d180c6d430ea99643b9b2b3407aedaa36703 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:09 -0700 Subject: af_unix: Include the whole hash table size in UNIX_HASH_SIZE. Currently, the size of AF_UNIX hash table is UNIX_HASH_SIZE * 2, the first half for bind()ed sockets and the second half for unbound ones. UNIX_HASH_SIZE * 2 is used to define the table and iterate over it. In some places, we use ARRAY_SIZE(unix_socket_table) instead of UNIX_HASH_SIZE * 2. However, we cannot use it anymore because we will allocate the hash table dynamically. Then, we would have to add UNIX_HASH_SIZE * 2 in many places, which would be troublesome. This patch adapts the UNIX_HASH_SIZE definition to include bound and unbound sockets and defines a new UNIX_HASH_MOD macro to ease calculations. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/af_unix.h | 7 ++++--- net/unix/af_unix.c | 18 +++++++++--------- net/unix/diag.c | 6 ++---- 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index a7ef624ed726..acb56e463db1 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -16,12 +16,13 @@ void wait_for_unix_gc(void); struct sock *unix_get_socket(struct file *filp); struct sock *unix_peer_get(struct sock *sk); -#define UNIX_HASH_SIZE 256 +#define UNIX_HASH_MOD (256 - 1) +#define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 extern unsigned int unix_tot_inflight; -extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; -extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; +extern spinlock_t unix_table_locks[UNIX_HASH_SIZE]; +extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; struct unix_address { refcount_t refcnt; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 990257f02e7c..c0804ae9c96a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,9 +118,9 @@ #include "scm.h" -spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +spinlock_t unix_table_locks[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_table_locks); -struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; +struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); static atomic_long_t unix_nr_socks; @@ -137,12 +137,12 @@ static unsigned int unix_unbound_hash(struct sock *sk) hash ^= hash >> 8; hash ^= sk->sk_type; - return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); + return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static unsigned int unix_bsd_hash(struct inode *i) { - return i->i_ino & (UNIX_HASH_SIZE - 1); + return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, @@ -155,14 +155,14 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, hash ^= hash >> 8; hash ^= type; - return hash & (UNIX_HASH_SIZE - 1); + return hash & UNIX_HASH_MOD; } static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) { /* hash1 and hash2 is never the same because - * one is between 0 and UNIX_HASH_SIZE - 1, and - * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. + * one is between 0 and UNIX_HASH_MOD, and + * another is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1. */ if (hash1 > hash2) swap(hash1, hash2); @@ -3239,7 +3239,7 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) unsigned long bucket = get_bucket(*pos); struct sock *sk; - while (bucket < ARRAY_SIZE(unix_socket_table)) { + while (bucket < UNIX_HASH_SIZE) { spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); @@ -3666,7 +3666,7 @@ static int __init af_unix_init(void) BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + for (i = 0; i < UNIX_HASH_SIZE; i++) spin_lock_init(&unix_table_locks[i]); rc = proto_register(&unix_dgram_proto, 1); diff --git a/net/unix/diag.c b/net/unix/diag.c index 4e3dc8179fa4..c5d1cca72aa5 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -204,9 +204,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_slot = cb->args[0]; num = s_num = cb->args[1]; - for (slot = s_slot; - slot < ARRAY_SIZE(unix_socket_table); - s_num = 0, slot++) { + for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) { struct sock *sk; num = 0; @@ -242,7 +240,7 @@ static struct sock *unix_lookup_by_ino(unsigned int ino) struct sock *sk; int i; - for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { + for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock(&unix_table_locks[i]); sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { -- cgit v1.2.3 From b6e811383062f88212082714db849127fa95142c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:10 -0700 Subject: af_unix: Define a per-netns hash table. This commit adds a per netns hash table for AF_UNIX, which size is fixed as UNIX_HASH_SIZE for now. The first implementation defines a per-netns hash table as a single array of lock and list: struct unix_hashbucket { spinlock_t lock; struct hlist_head head; }; struct netns_unix { struct unix_hashbucket *hash; ... }; But, Eric pointed out memory cost that the structure has holes because of sizeof(spinlock_t), which is 4 (or more if LOCKDEP is enabled). [0] It could be expensive on a host with thousands of netns and few AF_UNIX sockets. For this reason, a per-netns hash table uses two dense arrays. struct unix_table { spinlock_t *locks; struct hlist_head *buckets; }; struct netns_unix { struct unix_table table; ... }; Note the length of the list has a significant impact rather than lock contention, so having shared locks can be an option. But, per-netns locks and lists still perform better than the global locks and per-netns lists. [1] Also, this patch adds a change so that struct netns_unix disappears from struct net if CONFIG_UNIX is disabled. [0]: https://lore.kernel.org/netdev/CANn89iLVxO5aqx16azNU7p7Z-nz5NrnM5QTqOzueVxEnkVTxyg@mail.gmail.com/ [1]: https://lore.kernel.org/netdev/20220617175215.1769-1-kuniyu@amazon.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/net_namespace.h | 2 ++ include/net/netns/unix.h | 6 ++++++ net/unix/af_unix.c | 38 ++++++++++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c4f5601f6e32..20a2992901c2 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -120,7 +120,9 @@ struct net { struct netns_core core; struct netns_mib mib; struct netns_packet packet; +#if IS_ENABLED(CONFIG_UNIX) struct netns_unix unx; +#endif struct netns_nexthop nexthop; struct netns_ipv4 ipv4; #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/netns/unix.h b/include/net/netns/unix.h index 91a3d7e39198..6f1a33df061d 100644 --- a/include/net/netns/unix.h +++ b/include/net/netns/unix.h @@ -5,8 +5,14 @@ #ifndef __NETNS_UNIX_H__ #define __NETNS_UNIX_H__ +struct unix_table { + spinlock_t *locks; + struct hlist_head *buckets; +}; + struct ctl_table_header; struct netns_unix { + struct unix_table table; int sysctl_max_dgram_qlen; struct ctl_table_header *ctl; }; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c0804ae9c96a..cdd12881a39d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3559,7 +3559,7 @@ static const struct net_proto_family unix_family_ops = { static int __net_init unix_net_init(struct net *net) { - int error = -ENOMEM; + int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) @@ -3567,18 +3567,44 @@ static int __net_init unix_net_init(struct net *net) #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, - sizeof(struct seq_net_private))) { - unix_sysctl_unregister(net); - goto out; + sizeof(struct seq_net_private))) + goto err_sysctl; +#endif + + net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(spinlock_t), GFP_KERNEL); + if (!net->unx.table.locks) + goto err_proc; + + net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->unx.table.buckets) + goto free_locks; + + for (i = 0; i < UNIX_HASH_SIZE; i++) { + spin_lock_init(&net->unx.table.locks[i]); + INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } + + return 0; + +free_locks: + kvfree(net->unx.table.locks); +err_proc: +#ifdef CONFIG_PROC_FS + remove_proc_entry("unix", net->proc_net); +err_sysctl: #endif - error = 0; + unix_sysctl_unregister(net); out: - return error; + return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { + kvfree(net->unx.table.buckets); + kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } -- cgit v1.2.3 From 79b05beaa5c340e1649dc7462e056ae33b916131 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:11 -0700 Subject: af_unix: Acquire/Release per-netns hash table's locks. This commit adds extra spin_lock/spin_unlock() for a per-netns hash table inside the existing ones for unix_table_locks. As of this commit, sockets are still linked in the global hash table. After putting sockets in a per-netns hash table and removing the old one in the next patch, we remove the global locks in the last patch. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 75 +++++++++++++++++++++++++++++++++++++----------------- net/unix/diag.c | 23 +++++++++++------ 2 files changed, 66 insertions(+), 32 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cdd12881a39d..79f8fc5cdce8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -158,7 +158,8 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, return hash & UNIX_HASH_MOD; } -static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_lock(struct net *net, + unsigned int hash1, unsigned int hash2) { /* hash1 and hash2 is never the same because * one is between 0 and UNIX_HASH_MOD, and @@ -169,10 +170,17 @@ static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) spin_lock(&unix_table_locks[hash1]); spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); + + spin_lock(&net->unx.table.locks[hash1]); + spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } -static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_unlock(struct net *net, + unsigned int hash1, unsigned int hash2) { + spin_unlock(&net->unx.table.locks[hash1]); + spin_unlock(&net->unx.table.locks[hash2]); + spin_unlock(&unix_table_locks[hash1]); spin_unlock(&unix_table_locks[hash2]); } @@ -316,17 +324,21 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, __unix_insert_socket(sk); } -static void unix_remove_socket(struct sock *sk) +static void unix_remove_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); + spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } -static void unix_insert_unbound_socket(struct sock *sk) +static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); + spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(sk); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } @@ -356,28 +368,33 @@ static inline struct sock *unix_find_socket_byname(struct net *net, struct sock *s; spin_lock(&unix_table_locks[hash]); + spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return s; } -static struct sock *unix_find_socket_byinode(struct inode *i) +static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&unix_table_locks[hash]); + spin_lock(&net->unx.table.locks[hash]); sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return s; } } + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return NULL; } @@ -576,12 +593,12 @@ static void unix_sock_destructor(struct sock *sk) static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); - struct path path; struct sock *skpair; struct sk_buff *skb; + struct path path; int state; - unix_remove_socket(sk); + unix_remove_socket(sock_net(sk), sk); /* Clear state */ unix_state_lock(sk); @@ -930,7 +947,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_unbound_socket(sk); + unix_insert_unbound_socket(net, sk); sock_prot_inuse_add(net, sk->sk_prot, 1); @@ -1015,7 +1032,7 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, if (!S_ISSOCK(inode->i_mode)) goto path_put; - sk = unix_find_socket_byinode(inode); + sk = unix_find_socket_byinode(net, inode); if (!sk) goto path_put; @@ -1074,6 +1091,7 @@ static int unix_autobind(struct sock *sk) { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; @@ -1102,11 +1120,10 @@ retry: sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) { - unix_table_double_unlock(old_hash, new_hash); + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { + unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. @@ -1124,7 +1141,7 @@ retry: } __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); @@ -1138,6 +1155,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct user_namespace *ns; // barf... struct unix_address *addr; struct dentry *dentry; @@ -1178,11 +1196,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1205,6 +1223,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1222,19 +1241,18 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -3237,15 +3255,18 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&unix_table_locks[bucket]); + spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; + spin_unlock(&net->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3258,11 +3279,13 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct net *net = seq_file_net(seq); for (sk = sk_next(sk); sk; sk = sk_next(sk)) - if (sock_net(sk) == seq_file_net(seq)) + if (sock_net(sk) == net) return sk; + spin_unlock(&net->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3292,8 +3315,10 @@ static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; - if (sk) + if (sk) { + spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); + } } static int unix_seq_show(struct seq_file *seq, void *v) @@ -3381,6 +3406,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); unsigned int expected = 1; struct sock *sk; @@ -3388,7 +3414,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) + if (sock_net(sk) != net) continue; if (iter->end_sk < iter->max_sk) { @@ -3399,6 +3425,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) expected++; } + spin_unlock(&net->unx.table.locks[start_sk->sk_hash]); spin_unlock(&unix_table_locks[start_sk->sk_hash]); return expected; diff --git a/net/unix/diag.c b/net/unix/diag.c index c5d1cca72aa5..7fc377435114 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -195,9 +195,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct unix_diag_req *req; - int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; + struct unix_diag_req *req; req = nlmsg_data(cb->nlh); @@ -209,6 +209,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; spin_lock(&unix_table_locks[slot]); + spin_lock(&net->unx.table.locks[slot]); sk_for_each(sk, &unix_socket_table[slot]) { if (!net_eq(sock_net(sk), net)) continue; @@ -220,12 +221,14 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) { + spin_unlock(&net->unx.table.locks[slot]); spin_unlock(&unix_table_locks[slot]); goto done; } next: num++; } + spin_unlock(&net->unx.table.locks[slot]); spin_unlock(&unix_table_locks[slot]); } done: @@ -235,19 +238,22 @@ done: return skb->len; } -static struct sock *unix_lookup_by_ino(unsigned int ino) +static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino) { struct sock *sk; int i; for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock(&unix_table_locks[i]); + spin_lock(&net->unx.table.locks[i]); sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { sock_hold(sk); + spin_unlock(&net->unx.table.locks[i]); spin_unlock(&unix_table_locks[i]); return sk; } + spin_unlock(&net->unx.table.locks[i]); spin_unlock(&unix_table_locks[i]); } return NULL; @@ -257,16 +263,17 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh, struct unix_diag_req *req) { - int err = -EINVAL; - struct sock *sk; - struct sk_buff *rep; - unsigned int extra_len; struct net *net = sock_net(in_skb->sk); + unsigned int extra_len; + struct sk_buff *rep; + struct sock *sk; + int err; + err = -EINVAL; if (req->udiag_ino == 0) goto out_nosk; - sk = unix_lookup_by_ino(req->udiag_ino); + sk = unix_lookup_by_ino(net, req->udiag_ino); err = -ENOENT; if (sk == NULL) goto out_nosk; -- cgit v1.2.3 From cf2f225e2653734e66e91c09e1cbe004bfd3d4a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:12 -0700 Subject: af_unix: Put a socket into a per-netns hash table. This commit replaces the global hash table with a per-netns one and removes the global one. We now link a socket in each netns's hash table so we can save some netns comparisons when iterating through a hash bucket. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 50 ++++++++++++++++++++------------------------------ net/unix/diag.c | 9 +++------ 3 files changed, 23 insertions(+), 37 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index acb56e463db1..b1748c9b6db2 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,7 +22,6 @@ struct sock *unix_peer_get(struct sock *sk); extern unsigned int unix_tot_inflight; extern spinlock_t unix_table_locks[UNIX_HASH_SIZE]; -extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; struct unix_address { refcount_t refcnt; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 79f8fc5cdce8..9d0b07235dbc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -120,8 +120,6 @@ spinlock_t unix_table_locks[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_table_locks); -struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_socket_table); static atomic_long_t unix_nr_socks; /* SMP locking strategy: @@ -308,20 +306,20 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct sock *sk) +static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); - sk_add_node(sk, &unix_socket_table[sk->sk_hash]); + sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } -static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, - unsigned int hash) +static void __unix_set_addr_hash(struct net *net, struct sock *sk, + struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; - __unix_insert_socket(sk); + __unix_insert_socket(net, sk); } static void unix_remove_socket(struct net *net, struct sock *sk) @@ -337,7 +335,7 @@ static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); - __unix_insert_socket(sk); + __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } @@ -348,12 +346,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, { struct sock *s; - sk_for_each(s, &unix_socket_table[hash]) { + sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); - if (!net_eq(sock_net(s), net)) - continue; - if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; @@ -384,7 +379,7 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); - sk_for_each(s, &unix_socket_table[hash]) { + sk_for_each(s, &net->unx.table.buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { @@ -1140,7 +1135,7 @@ retry: goto retry; } - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; @@ -1199,7 +1194,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); @@ -1246,7 +1241,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; @@ -3239,12 +3234,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); - struct sock *sk; unsigned long count = 0; + struct sock *sk; - for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) - continue; + for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); + sk; sk = sk_next(sk)) { if (++count == offset) break; } @@ -3279,13 +3273,13 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); - struct net *net = seq_file_net(seq); - for (sk = sk_next(sk); sk; sk = sk_next(sk)) - if (sock_net(sk) == net) - return sk; + sk = sk_next(sk); + if (sk) + return sk; + - spin_unlock(&net->unx.table.locks[bucket]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3406,7 +3400,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; - struct net *net = seq_file_net(seq); unsigned int expected = 1; struct sock *sk; @@ -3414,9 +3407,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { - if (sock_net(sk) != net) - continue; - if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; @@ -3425,7 +3415,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) expected++; } - spin_unlock(&net->unx.table.locks[start_sk->sk_hash]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); spin_unlock(&unix_table_locks[start_sk->sk_hash]); return expected; diff --git a/net/unix/diag.c b/net/unix/diag.c index 7fc377435114..4d0f0ca6a1eb 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -210,9 +210,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; spin_lock(&unix_table_locks[slot]); spin_lock(&net->unx.table.locks[slot]); - sk_for_each(sk, &unix_socket_table[slot]) { - if (!net_eq(sock_net(sk), net)) - continue; + sk_for_each(sk, &net->unx.table.buckets[slot]) { if (num < s_num) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) @@ -246,13 +244,14 @@ static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino) for (i = 0; i < UNIX_HASH_SIZE; i++) { spin_lock(&unix_table_locks[i]); spin_lock(&net->unx.table.locks[i]); - sk_for_each(sk, &unix_socket_table[i]) + sk_for_each(sk, &net->unx.table.buckets[i]) { if (ino == sock_i_ino(sk)) { sock_hold(sk); spin_unlock(&net->unx.table.locks[i]); spin_unlock(&unix_table_locks[i]); return sk; } + } spin_unlock(&net->unx.table.locks[i]); spin_unlock(&unix_table_locks[i]); } @@ -277,8 +276,6 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, err = -ENOENT; if (sk == NULL) goto out_nosk; - if (!net_eq(sock_net(sk), net)) - goto out; err = sock_diag_check_cookie(sk, req->udiag_cookie); if (err) -- cgit v1.2.3 From 2f7ca90a0188b57a54d3b1159eb7874427a7e07a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:13 -0700 Subject: af_unix: Remove unix_table_locks. unix_table_locks are to protect the global hash table, unix_socket_table. The previous commit removed it, so let's clean up the unnecessary locks. Here is a test result on EC2 c5.9xlarge where 10 processes run concurrently in different netns and bind 100,000 sockets for each. without this series : 1m 38s with this series : 11s It is ~10x faster because the global hash table is split into 10 netns in this case. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 42 ++++++++---------------------------------- net/unix/diag.c | 8 +------- 3 files changed, 9 insertions(+), 42 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b1748c9b6db2..480fa579787e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -21,7 +21,6 @@ struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_BITS 8 extern unsigned int unix_tot_inflight; -extern spinlock_t unix_table_locks[UNIX_HASH_SIZE]; struct unix_address { refcount_t refcnt; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9d0b07235dbc..49f6626330c3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,13 +118,11 @@ #include "scm.h" -spinlock_t unix_table_locks[UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_table_locks); static atomic_long_t unix_nr_socks; /* SMP locking strategy: - * hash table is protected with spinlock unix_table_locks - * each socket state is protected by separate spin lock. + * hash table is protected with spinlock. + * each socket state is protected by separate spinlock. */ static unsigned int unix_unbound_hash(struct sock *sk) @@ -166,9 +164,6 @@ static void unix_table_double_lock(struct net *net, if (hash1 > hash2) swap(hash1, hash2); - spin_lock(&unix_table_locks[hash1]); - spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); - spin_lock(&net->unx.table.locks[hash1]); spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } @@ -178,9 +173,6 @@ static void unix_table_double_unlock(struct net *net, { spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); - - spin_unlock(&unix_table_locks[hash1]); - spin_unlock(&unix_table_locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK @@ -324,20 +316,16 @@ static void __unix_set_addr_hash(struct net *net, struct sock *sk, static void unix_remove_socket(struct net *net, struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); - spin_unlock(&unix_table_locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); - spin_unlock(&unix_table_locks[sk->sk_hash]); } static struct sock *__unix_find_socket_byname(struct net *net, @@ -362,13 +350,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s; - spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return s; } @@ -377,7 +363,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); sk_for_each(s, &net->unx.table.buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; @@ -385,12 +370,10 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return s; } } spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return NULL; } @@ -1551,9 +1534,9 @@ restart: * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_locks. Insertion - * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_locks, + * otheru in hash under its lock. Insertion into the + * hash chain we'd found it in had been done in an + * earlier critical area protected by the chain's lock, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * @@ -3253,7 +3236,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) struct sock *sk; while (bucket < UNIX_HASH_SIZE) { - spin_lock(&unix_table_locks[bucket]); spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); @@ -3261,7 +3243,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) return sk; spin_unlock(&net->unx.table.locks[bucket]); - spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); } @@ -3280,7 +3261,6 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); - spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3309,10 +3289,8 @@ static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; - if (sk) { + if (sk) spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); - spin_unlock(&unix_table_locks[sk->sk_hash]); - } } static int unix_seq_show(struct seq_file *seq, void *v) @@ -3337,7 +3315,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { // under unix_table_locks here + if (u->addr) { // under a hash table lock here int i, len; seq_putc(seq, ' '); @@ -3416,7 +3394,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) } spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); - spin_unlock(&unix_table_locks[start_sk->sk_hash]); return expected; } @@ -3705,13 +3682,10 @@ static void __init bpf_iter_register(void) static int __init af_unix_init(void) { - int i, rc = -1; + int rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - for (i = 0; i < UNIX_HASH_SIZE; i++) - spin_lock_init(&unix_table_locks[i]); - rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); diff --git a/net/unix/diag.c b/net/unix/diag.c index 4d0f0ca6a1eb..105f522a89fe 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -13,7 +13,7 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - /* might or might not have unix_table_locks */ + /* might or might not have a hash table lock */ struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) @@ -208,7 +208,6 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct sock *sk; num = 0; - spin_lock(&unix_table_locks[slot]); spin_lock(&net->unx.table.locks[slot]); sk_for_each(sk, &net->unx.table.buckets[slot]) { if (num < s_num) @@ -220,14 +219,12 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) { spin_unlock(&net->unx.table.locks[slot]); - spin_unlock(&unix_table_locks[slot]); goto done; } next: num++; } spin_unlock(&net->unx.table.locks[slot]); - spin_unlock(&unix_table_locks[slot]); } done: cb->args[0] = slot; @@ -242,18 +239,15 @@ static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino) int i; for (i = 0; i < UNIX_HASH_SIZE; i++) { - spin_lock(&unix_table_locks[i]); spin_lock(&net->unx.table.locks[i]); sk_for_each(sk, &net->unx.table.buckets[i]) { if (ino == sock_i_ino(sk)) { sock_hold(sk); spin_unlock(&net->unx.table.locks[i]); - spin_unlock(&unix_table_locks[i]); return sk; } } spin_unlock(&net->unx.table.locks[i]); - spin_unlock(&unix_table_locks[i]); } return NULL; } -- cgit v1.2.3 From 51bae889fe111e418321ff0e6bb5f67e64cb9042 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 2 Jul 2022 08:48:17 -0700 Subject: af_unix: Put pathname sockets in the global hash table. Commit cf2f225e2653 ("af_unix: Put a socket into a per-netns hash table.") accidentally broke user API for pathname sockets. A socket was able to connect() to a pathname socket whose file was visible even if they were in different network namespaces. The commit puts all sockets into a per-netns hash table. As a result, connect() to a pathname socket in a different netns fails to find it in the caller's per-netns hash table and returns -ECONNREFUSED even when the task can view the peer socket file. We can reproduce this issue by: Console A: # python3 >>> from socket import * >>> s = socket(AF_UNIX, SOCK_STREAM, 0) >>> s.bind('test') >>> s.listen(32) Console B: # ip netns add test # ip netns exec test sh # python3 >>> from socket import * >>> s = socket(AF_UNIX, SOCK_STREAM, 0) >>> s.connect('test') Note when dumping sockets by sock_diag, procfs, and bpf_iter, they are filtered only by netns. In other words, even if they are visible and connect()able, all sockets in different netns are skipped while iterating sockets. Thus, we need a fix only for finding a peer pathname socket. This patch adds a global hash table for pathname sockets, links them with sk_bind_node, and uses it in unix_find_socket_byinode(). By doing so, we can keep sockets in per-netns hash tables and dump them easily. Thanks to Sachin Sant and Leonard Crestez for reports, logs and a reproducer. Fixes: cf2f225e2653 ("af_unix: Put a socket into a per-netns hash table.") Reported-by: Sachin Sant Reported-by: Leonard Crestez Tested-by: Sachin Sant Tested-by: Nathan Chancellor Signed-off-by: Kuniyuki Iwashima Tested-by: Leonard Crestez Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 49f6626330c3..526b872cc710 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -119,6 +119,8 @@ #include "scm.h" static atomic_long_t unix_nr_socks; +static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; +static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. @@ -328,6 +330,24 @@ static void unix_insert_unbound_socket(struct net *net, struct sock *sk) spin_unlock(&net->unx.table.locks[sk->sk_hash]); } +static void unix_insert_bsd_socket(struct sock *sk) +{ + spin_lock(&bsd_socket_locks[sk->sk_hash]); + sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); +} + +static void unix_remove_bsd_socket(struct sock *sk) +{ + if (!hlist_unhashed(&sk->sk_bind_node)) { + spin_lock(&bsd_socket_locks[sk->sk_hash]); + __sk_del_bind_node(sk); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); + + sk_node_init(&sk->sk_bind_node); + } +} + static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) @@ -358,22 +378,22 @@ static inline struct sock *unix_find_socket_byname(struct net *net, return s; } -static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) +static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&net->unx.table.locks[hash]); - sk_for_each(s, &net->unx.table.buckets[hash]) { + spin_lock(&bsd_socket_locks[hash]); + sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - spin_unlock(&net->unx.table.locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return s; } } - spin_unlock(&net->unx.table.locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return NULL; } @@ -577,6 +597,7 @@ static void unix_release_sock(struct sock *sk, int embrion) int state; unix_remove_socket(sock_net(sk), sk); + unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); @@ -988,8 +1009,8 @@ static int unix_release(struct socket *sock) return 0; } -static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) +static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, + int type) { struct inode *inode; struct path path; @@ -1010,7 +1031,7 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, if (!S_ISSOCK(inode->i_mode)) goto path_put; - sk = unix_find_socket_byinode(net, inode); + sk = unix_find_socket_byinode(inode); if (!sk) goto path_put; @@ -1058,7 +1079,7 @@ static struct sock *unix_find_other(struct net *net, struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, addr_len, type); + sk = unix_find_bsd(sunaddr, addr_len, type); else sk = unix_find_abstract(net, sunaddr, addr_len, type); @@ -1179,6 +1200,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, u->path.dentry = dget(dentry); __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); + unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -3682,10 +3704,15 @@ static void __init bpf_iter_register(void) static int __init af_unix_init(void) { - int rc = -1; + int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); + for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { + spin_lock_init(&bsd_socket_locks[i]); + INIT_HLIST_HEAD(&bsd_socket_buckets[i]); + } + rc = proto_register(&unix_dgram_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); -- cgit v1.2.3 From cf21b355ccb39b0de0b6a7362532bb5584c84a80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 5 Jul 2022 16:37:15 -0700 Subject: af_unix: Optimise hash table layout. Commit 6dd4142fb5a9 ("Merge branch 'af_unix-per-netns-socket-hash'") and commit 51bae889fe11 ("af_unix: Put pathname sockets in the global hash table.") changed a hash table layout. Before: unix_socket_table [0 - 255] : abstract & pathname sockets [256 - 511] : unnamed sockets After: per-netns table [0 - 255] : abstract & pathname sockets [256 - 511] : unnamed sockets bsd_socket_table [0 - 255] : pathname sockets (sk_bind_node) Now, while looking up sockets, we traverse the global table for the pathname sockets and the first half of each per-netns hash table for abstract sockets, where pathname sockets are also linked. Thus, the more pathname sockets we have, the longer we take to look up abstract sockets. This characteristic has been there before the layout change, but we can improve it now. This patch changes the per-netns hash table's layout so that sockets not requiring lookup reside in the first half and do not impact the lookup of abstract sockets. per-netns table [0 - 255] : pathname & unnamed sockets [256 - 511] : abstract sockets bsd_socket_table [0 - 255] : pathname sockets (sk_bind_node) We have run a test that bind()s 100,000 abstract/pathname sockets for each, bind()s an abstract socket 100,000 times and measures the time on __unix_find_socket_byname(). The result shows that the patch makes each lookup faster. Without this patch: $ sudo ./funclatency -p 2278 --microseconds __unix_find_socket_byname.isra.44 usec : count distribution 0 -> 1 : 0 | | 2 -> 3 : 0 | | 4 -> 7 : 0 | | 8 -> 15 : 126 | | 16 -> 31 : 1438 |* | 32 -> 63 : 4150 |*** | 64 -> 127 : 9049 |******* | 128 -> 255 : 37704 |******************************* | 256 -> 511 : 47533 |****************************************| With this patch: $ sudo ./funclatency -p 3648 --microseconds __unix_find_socket_byname.isra.46 usec : count distribution 0 -> 1 : 109 | | 2 -> 3 : 318 | | 4 -> 7 : 725 | | 8 -> 15 : 2501 |* | 16 -> 31 : 3061 |** | 32 -> 63 : 4028 |*** | 64 -> 127 : 9312 |******* | 128 -> 255 : 51372 |****************************************| 256 -> 511 : 28574 |********************** | Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220705233715.759-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 526b872cc710..784b4b30ce9a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -135,7 +135,7 @@ static unsigned int unix_unbound_hash(struct sock *sk) hash ^= hash >> 8; hash ^= sk->sk_type; - return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); + return hash & UNIX_HASH_MOD; } static unsigned int unix_bsd_hash(struct inode *i) @@ -153,16 +153,17 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, hash ^= hash >> 8; hash ^= type; - return hash & UNIX_HASH_MOD; + return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static void unix_table_double_lock(struct net *net, unsigned int hash1, unsigned int hash2) { - /* hash1 and hash2 is never the same because - * one is between 0 and UNIX_HASH_MOD, and - * another is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1. - */ + if (hash1 == hash2) { + spin_lock(&net->unx.table.locks[hash1]); + return; + } + if (hash1 > hash2) swap(hash1, hash2); @@ -173,6 +174,11 @@ static void unix_table_double_lock(struct net *net, static void unix_table_double_unlock(struct net *net, unsigned int hash1, unsigned int hash2) { + if (hash1 == hash2) { + spin_unlock(&net->unx.table.locks[hash1]); + return; + } + spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); } -- cgit v1.2.3