-rw-r--r--  Documentation/networking/ip-sysctl.rst  |   8
-rw-r--r--  include/net/sock.h                      |  19
-rw-r--r--  include/net/tcp.h                       |   4
-rw-r--r--  net/ipv4/af_inet.c                      |   4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c              |  12
-rw-r--r--  net/ipv4/tcp.c                          |  38
-rw-r--r--  net/ipv4/tcp_ipv4.c                     |   6
-rw-r--r--  net/ipv6/tcp_ipv6.c                     |   6
-rw-r--r--  net/mptcp/protocol.c                    | 137
9 files changed, 85 insertions, 149 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index d91ab28718d4..16b8bf72feaf 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -989,14 +989,6 @@ tcp_challenge_ack_limit - INTEGER
in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
Default: 1000
-tcp_rx_skb_cache - BOOLEAN
- Controls a per TCP socket cache of one skb, that might help
- performance of some workloads. This might be dangerous
- on systems with a lot of TCP sockets, since it increases
- memory usage.
-
- Default: 0 (disabled)
-
UDP variables
=============
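The doc text deleted above only hedges ("might be dangerous"); the risk is easy to quantify. A parked skb pins its whole truesize for as long as the socket sits idle, so the cost scales with socket count rather than with traffic. A back-of-envelope sketch, with illustrative numbers that are not from the patch:

	#include <stdio.h>

	/* One cached skb per socket pins its full truesize. At an assumed
	 * ~4 KiB truesize, a million idle TCP sockets pin close to 4 GiB
	 * in the rx cache alone.
	 */
	int main(void)
	{
		unsigned long sockets  = 1000 * 1000;	/* busy front end */
		unsigned long truesize = 4096;		/* assumed per-skb cost */

		printf("pinned: %lu MiB\n", sockets * truesize >> 20);
		return 0;
	}

Hence the caches are removed outright rather than left default-off.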
diff --git a/include/net/sock.h b/include/net/sock.h
index 66a9a90f9558..708b9de3cdbb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -262,7 +262,6 @@ struct bpf_local_storage;
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
- * @sk_rx_skb_cache: cache copy of recently accessed RX skb
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -328,7 +327,6 @@ struct bpf_local_storage;
* @sk_peek_off: current peek_offset value
* @sk_send_head: front of stuff to transmit
* @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
- * @sk_tx_skb_cache: cache copy of recently accessed TX skb
* @sk_security: used by security modules
* @sk_mark: generic packet mark
* @sk_cgrp_data: cgroup data for this cgroup
@@ -393,7 +391,6 @@ struct sock {
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
- struct sk_buff *sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -442,7 +439,6 @@ struct sock {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
- struct sk_buff *sk_tx_skb_cache;
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
@@ -1555,18 +1551,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
__sk_mem_reclaim(sk, 1 << 20);
}
-DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
sk_mem_uncharge(sk, skb->truesize);
- if (static_branch_unlikely(&tcp_tx_skb_cache_key) &&
- !sk->sk_tx_skb_cache && !skb_cloned(skb)) {
- skb_ext_reset(skb);
- skb_zcopy_clear(skb, true);
- sk->sk_tx_skb_cache = skb;
- return;
- }
__kfree_skb(skb);
}
@@ -2575,7 +2563,6 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
&skb_shinfo(skb)->tskey);
}
-DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2587,12 +2574,6 @@ DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_unlink(skb, &sk->sk_receive_queue);
- if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
- !sk->sk_rx_skb_cache) {
- sk->sk_rx_skb_cache = skb;
- skb_orphan(skb);
- return;
- }
__kfree_skb(skb);
}
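Both deleted branches are instances of the same static-key pattern: the key is defined false, so static_branch_unlikely() compiles to a NOP on the hot path until the sysctl flips it at runtime. A minimal sketch of that pattern (example_cache_key and sk_example_cache are hypothetical names, not real kernel symbols):

	DEFINE_STATIC_KEY_FALSE(example_cache_key);

	static void example_free(struct sock *sk, struct sk_buff *skb)
	{
		/* Dead code (a patched-out NOP) until the key is enabled. */
		if (static_branch_unlikely(&example_cache_key) &&
		    !sk->sk_example_cache) {
			sk->sk_example_cache = skb;	/* park instead of free */
			return;
		}
		__kfree_skb(skb);
	}

Dropping the fields therefore costs nothing on default configurations; what disappears is only the opt-in recycling path.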
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3166dc15d7d6..673c3b01e287 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -330,8 +330,6 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
- struct page *page, int offset, size_t *size);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
@@ -581,6 +579,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1d816a5fd3eb..40558033f857 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -133,10 +133,6 @@ void inet_sock_destruct(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_rx_skb_cache) {
- __kfree_skb(sk->sk_rx_skb_cache);
- sk->sk_rx_skb_cache = NULL;
- }
__skb_queue_purge(&sk->sk_error_queue);
sk_mem_reclaim(sk);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4680268f2e59..97eb54774924 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -585,18 +585,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &sysctl_fib_sync_mem_min,
.extra2 = &sysctl_fib_sync_mem_max,
},
- {
- .procname = "tcp_rx_skb_cache",
- .data = &tcp_rx_skb_cache_key.key,
- .mode = 0644,
- .proc_handler = proc_do_static_key,
- },
- {
- .procname = "tcp_tx_skb_cache",
- .data = &tcp_tx_skb_cache_key.key,
- .mode = 0644,
- .proc_handler = proc_do_static_key,
- },
{ }
};
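With the two ctl_table entries gone, the knobs simply vanish from /proc/sys/net/ipv4. A small userspace probe (illustrative, not part of the patch) distinguishes the two kernel generations:

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/proc/sys/net/ipv4/tcp_rx_skb_cache";
		FILE *f = fopen(path, "r");

		if (!f) {
			/* Kernel with this change: the file no longer exists. */
			printf("%s: absent\n", path);
			return 0;
		}
		/* Older kernel: proc_do_static_key renders the key as 0 or 1. */
		printf("%s: present, value=%c\n", path, fgetc(f));
		fclose(f);
		return 0;
	}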
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e8b48df73c85..414c179c28e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -325,11 +325,6 @@ struct tcp_splice_state {
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
-DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
-EXPORT_SYMBOL(tcp_rx_skb_cache_key);
-
-DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-
void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
@@ -647,7 +642,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
EXPORT_SYMBOL(tcp_ioctl);
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
@@ -658,7 +653,7 @@ static inline bool forced_push(const struct tcp_sock *tp)
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static void skb_entail(struct sock *sk, struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -866,18 +861,6 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
{
struct sk_buff *skb;
- if (likely(!size)) {
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- sk->sk_tx_skb_cache = NULL;
- pskb_trim(skb, 0);
- INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
- skb_shinfo(skb)->tx_flags = 0;
- memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
- return skb;
- }
- }
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
@@ -963,8 +946,8 @@ void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
}
}
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
- struct page *page, int offset, size_t *size)
+static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size)
{
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -985,7 +968,7 @@ new_segment:
#ifdef CONFIG_TLS_DEVICE
skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
}
@@ -1314,7 +1297,7 @@ new_segment:
process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
@@ -2920,11 +2903,6 @@ void tcp_write_queue_purge(struct sock *sk)
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- __kfree_skb(skb);
- sk->sk_tx_skb_cache = NULL;
- }
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2961,10 +2939,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_rx_skb_cache) {
- __kfree_skb(sk->sk_rx_skb_cache);
- sk->sk_rx_skb_cache = NULL;
- }
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
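tcp_build_frag() returns to being static now that MPTCP no longer calls it, and the two newly exported helpers take its place in the API. A caller that manages its own skb, as MPTCP now does, uses them in roughly this shape (condensed sketch; room_left() and alloc_prepared_skb() are placeholders, not real functions):

	struct sk_buff *skb = tcp_write_queue_tail(ssk);

	if (!skb || !room_left(skb)) {
		skb = alloc_prepared_skb();
		tcp_skb_entail(ssk, skb);	 /* queue it, init TCP_SKB_CB */
	} else if (skb_shinfo(skb)->nr_frags >= sysctl_max_skb_frags) {
		tcp_mark_push(tcp_sk(ssk), skb); /* set PSH, force a new skb */
	}

The real logic appears in full in the net/mptcp/protocol.c hunks below.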
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e62e0d6373a..29a57bd159f0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1941,7 +1941,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
- struct sk_buff *skb_to_free;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
const struct iphdr *iph;
@@ -2082,17 +2081,12 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- skb_to_free = sk->sk_rx_skb_cache;
- sk->sk_rx_skb_cache = NULL;
ret = tcp_v4_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
- skb_to_free = NULL;
}
bh_unlock_sock(sk);
- if (skb_to_free)
- __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0ce52d46e4f8..8cf5ff2e9504 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1618,7 +1618,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
- struct sk_buff *skb_to_free;
int sdif = inet6_sdif(skb);
int dif = inet6_iif(skb);
const struct tcphdr *th;
@@ -1754,17 +1753,12 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- skb_to_free = sk->sk_rx_skb_cache;
- sk->sk_rx_skb_cache = NULL;
ret = tcp_v6_do_rcv(sk, skb);
} else {
if (tcp_add_backlog(sk, skb))
goto discard_and_relse;
- skb_to_free = NULL;
}
bh_unlock_sock(sk);
- if (skb_to_free)
- __kfree_skb(skb_to_free);
put_and_return:
if (refcounted)
sock_put(sk);
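The skb_to_free dance removed from both receive paths existed to keep __kfree_skb() out of the locked section: the cached rx skb was snapshotted under bh_lock_sock() and only freed after bh_unlock_sock(). Reassembled from the deletions above, the pattern was:

	struct sk_buff *skb_to_free = NULL;

	bh_lock_sock_nested(sk);
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;	/* field removed by this patch */
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);

With the cache gone there is nothing to hand across the unlock, and both functions reduce to the plain do_rcv-or-backlog split.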
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index dbcebf56798f..7e5f76092b64 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
if (likely(__mptcp_add_ext(skb, gfp))) {
skb_reserve(skb, MAX_TCP_HEADER);
skb->reserved_tailroom = skb->end - skb->tail;
+ INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
@@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
return NULL;
}
-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
{
struct sk_buff *skb;
- if (ssk->sk_tx_skb_cache) {
- skb = ssk->sk_tx_skb_cache;
- if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
- !__mptcp_add_ext(skb, gfp)))
- return false;
- return true;
- }
-
skb = __mptcp_do_alloc_tx_skb(sk, gfp);
if (!skb)
- return false;
+ return NULL;
if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
- ssk->sk_tx_skb_cache = skb;
- return true;
+ tcp_skb_entail(ssk, skb);
+ return skb;
}
kfree_skb(skb);
- return false;
+ return NULL;
}
-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
{
gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
@@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_sendmsg_info *info)
{
u64 data_seq = dfrag->data_seq + info->sent;
+ int offset = dfrag->offset + info->sent;
struct mptcp_sock *msk = mptcp_sk(sk);
bool zero_window_probe = false;
struct mptcp_ext *mpext = NULL;
- struct sk_buff *skb, *tail;
- bool must_collapse = false;
- int size_bias = 0;
- int avail_size;
- size_t ret = 0;
+ bool can_coalesce = false;
+ bool reuse_skb = true;
+ struct sk_buff *skb;
+ size_t copy;
+ int i;
pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
+ if (WARN_ON_ONCE(info->sent > info->limit ||
+ info->limit > dfrag->data_len))
+ return 0;
+
/* compute send limit */
info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
- avail_size = info->size_goal;
+ copy = info->size_goal;
+
skb = tcp_write_queue_tail(ssk);
- if (skb) {
+ if (skb && copy > skb->len) {
/* Limit the write to the size available in the
* current skb, if any, so that we create at most a new skb.
* Explicitly tells TCP internals to avoid collapsing on later
@@ -1316,62 +1315,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
goto alloc_skb;
}
- must_collapse = (info->size_goal > skb->len) &&
- (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
- if (must_collapse) {
- size_bias = skb->len;
- avail_size = info->size_goal - skb->len;
+ i = skb_shinfo(skb)->nr_frags;
+ can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
+ tcp_mark_push(tcp_sk(ssk), skb);
+ goto alloc_skb;
}
- }
+ copy -= skb->len;
+ } else {
alloc_skb:
- if (!must_collapse &&
- !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
- return 0;
+ skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+ if (!skb)
+ return -ENOMEM;
+
+ i = skb_shinfo(skb)->nr_frags;
+ reuse_skb = false;
+ mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+ }
/* Zero window and all data acked? Probe. */
- avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
- if (avail_size == 0) {
+ copy = mptcp_check_allowed_size(msk, data_seq, copy);
+ if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
- if (skb || snd_una != msk->snd_nxt)
+ if (snd_una != msk->snd_nxt) {
+ tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
return 0;
+ }
+
zero_window_probe = true;
data_seq = snd_una - 1;
- avail_size = 1;
- }
+ copy = 1;
- if (WARN_ON_ONCE(info->sent > info->limit ||
- info->limit > dfrag->data_len))
- return 0;
+ /* all mptcp-level data is acked, no skbs should be present in the
+ * ssk write queue
+ */
+ WARN_ON_ONCE(reuse_skb);
+ }
- ret = info->limit - info->sent;
- tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
- dfrag->page, dfrag->offset + info->sent, &ret);
- if (!tail) {
- tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+ copy = min_t(size_t, copy, info->limit - info->sent);
+ if (!sk_wmem_schedule(ssk, copy)) {
+ tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
return -ENOMEM;
}
- /* if the tail skb is still the cached one, collapsing really happened.
- */
- if (skb == tail) {
- TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
- mpext->data_len += ret;
+ if (can_coalesce) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ get_page(dfrag->page);
+ skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
+ }
+
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->truesize += copy;
+ sk_wmem_queued_add(ssk, copy);
+ sk_mem_charge(ssk, copy);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+ TCP_SKB_CB(skb)->end_seq += copy;
+ tcp_skb_pcount_set(skb, 0);
+
+ /* on skb reuse we just need to update the DSS len */
+ if (reuse_skb) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+ mpext->data_len += copy;
WARN_ON_ONCE(zero_window_probe);
goto out;
}
- mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
- if (WARN_ON_ONCE(!mpext)) {
- /* should never reach here, stream corrupted */
- return -EINVAL;
- }
-
memset(mpext, 0, sizeof(*mpext));
mpext->data_seq = data_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
- mpext->data_len = ret;
+ mpext->data_len = copy;
mpext->use_map = 1;
mpext->dsn64 = 1;
@@ -1380,18 +1397,18 @@ alloc_skb:
mpext->dsn64);
if (zero_window_probe) {
- mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+ mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
mpext->frozen = 1;
if (READ_ONCE(msk->csum_enabled))
- mptcp_update_data_checksum(tail, ret);
+ mptcp_update_data_checksum(skb, copy);
tcp_push_pending_frames(ssk);
return 0;
}
out:
if (READ_ONCE(msk->csum_enabled))
- mptcp_update_data_checksum(tail, ret);
- mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
- return ret;
+ mptcp_update_data_checksum(skb, copy);
+ mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+ return copy;
}
#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
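The rewritten mptcp_sendmsg_frag() now does by hand what do_tcp_sendpages() does for plain TCP: try to extend the tail skb, otherwise entail a fresh one. The coalescing test hinges on skb_can_coalesce(), which (roughly, as of kernels of this vintage) only answers true when the new chunk is byte-contiguous with the last page fragment already attached:

	static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
					    const struct page *page, int off)
	{
		if (skb_zcopy(skb))
			return false;
		if (i) {
			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

			return page == skb_frag_page(frag) &&
			       off == skb_frag_off(frag) + skb_frag_size(frag);
		}
		return false;
	}

When it holds, skb_frag_size_add() grows the existing fragment without consuming a frags[] slot, which is why the nr_frags check against sysctl_max_skb_frags only matters in the !can_coalesce case.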