From f24b9be5957b38bb420b838115040dc2031b7d0c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:45 -0400 Subject: net-timestamp: extend SCM_TIMESTAMPING ancillary data struct Applications that request kernel tx timestamps with SO_TIMESTAMPING read timestamps as recvmsg() ancillary data. The response is defined implicitly as timespec[3]. 1) define struct scm_timestamping explicitly and 2) add support for new tstamp types. On tx, scm_timestamping always accompanies a sock_extended_err. Define previously unused field ee_info to signal the type of ts[0]. Introduce SCM_TSTAMP_SND to define the existing behavior. The reception path is not modified. On rx, no struct similar to sock_extended_err is passed along with SCM_TIMESTAMPING. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ include/net/sock.h | 4 +++- include/uapi/linux/errqueue.h | 18 ++++++++++++++++++ net/core/skbuff.c | 1 + net/socket.c | 20 +++++++++++--------- 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 281deced7469..477f0f60db45 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -249,6 +249,9 @@ enum { SKBTX_SHARED_FRAG = 1 << 5, }; +#define SKBTX_ANY_SW_TSTAMP SKBTX_SW_TSTAMP +#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) + /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. diff --git a/include/net/sock.h b/include/net/sock.h index b91c8868ab8d..02f5b35e65f1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2169,7 +2169,9 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) */ if (sock_flag(sk, SOCK_RCVTSTAMP) || sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || - (kt.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) || + (kt.tv64 && + (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) || (hwtstamps->hwtstamp.tv64 && sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index aacd4fb7102a..accee72cae7c 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -22,5 +22,23 @@ struct sock_extended_err { #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +/** + * struct scm_timestamping - timestamps exposed through cmsg + * + * The timestamping interfaces SO_TIMESTAMPING, MSG_TSTAMP_* + * communicate network timestamps by passing this struct in a cmsg with + * recvmsg(). See Documentation/networking/timestamping.txt for details. + */ +struct scm_timestamping { + struct timespec ts[3]; +}; + +/* The type of scm_timestamping, passed in sock_extended_err ee_info. + * This defines the type of ts[0]. For SCM_TSTAMP_SND only, if ts[0] + * is zero, then this is a hardware timestamp and recorded in ts[2]. + */ +enum { + SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ +}; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a33033cbe2..c9f68802e992 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3521,6 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + serr->ee.ee_info = SCM_TSTAMP_SND; err = sock_queue_err_skb(sk, skb); diff --git a/net/socket.c b/net/socket.c index d8222c025061..dc0cc5d95ee5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -106,6 +106,7 @@ #include #include #include +#include #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; @@ -697,7 +698,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); - struct timespec ts[3]; + struct scm_timestamping tss; int empty = 1; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); @@ -714,24 +715,25 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, sizeof(tv), &tv); } else { - skb_get_timestampns(skb, &ts[0]); + struct timespec ts; + skb_get_timestampns(skb, &ts); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, - sizeof(ts[0]), &ts[0]); + sizeof(ts), &ts); } } - - memset(ts, 0, sizeof(ts)); - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec_cond(skb->tstamp, ts + 0)) + memset(&tss, 0, sizeof(tss)); + if ((sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) && + ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts + 2)) + ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; if (!empty) put_cmsg(msg, SOL_SOCKET, - SCM_TIMESTAMPING, sizeof(ts), &ts); + SCM_TIMESTAMPING, sizeof(tss), &tss); } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); -- cgit v1.2.3 From b9f40e21ef4298650ab33e35740fa85bd57706d5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:46 -0400 Subject: net-timestamp: move timestamp flags out of sk_flags sk_flags is reaching its limit. New timestamping options will not fit. Move all of them into a new field sk->sk_tsflags. Added benefit is that this removes boilerplate code to convert between SOF_TIMESTAMPING_.. and SOCK_TIMESTAMPING_.. in getsockopt/setsockopt. SOCK_TIMESTAMPING_RX_SOFTWARE is also used to toggle the receive timestamp logic (netstamp_needed). That can be simplified and this last key removed, but will leave that for a separate patch. Signed-off-by: Willem de Bruijn ---- The u16 in sock can be moved into a 16-bit hole below sk_gso_max_segs, though that scatters tstamp fields throughout the struct. Signed-off-by: David S. Miller --- include/net/sock.h | 29 +++++++++++------------------ net/core/sock.c | 25 ++----------------------- net/socket.c | 8 ++++---- 3 files changed, 17 insertions(+), 45 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 02f5b35e65f1..a21129716aae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -67,6 +67,7 @@ #include #include #include +#include struct cgroup; struct cgroup_subsys; @@ -278,6 +279,7 @@ struct cg_proto; * @sk_protinfo: private area, net family specific, when not using slab * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received + * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -411,6 +413,7 @@ struct sock { void *sk_protinfo; struct timer_list sk_timer; ktime_t sk_stamp; + u16 sk_tsflags; struct socket *sk_socket; void *sk_user_data; struct page_frag sk_frag; @@ -701,12 +704,7 @@ enum sock_flags { SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ - SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ - SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ - SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ - SOCK_TIMESTAMPING_SOFTWARE, /* %SOF_TIMESTAMPING_SOFTWARE */ - SOCK_TIMESTAMPING_RAW_HARDWARE, /* %SOF_TIMESTAMPING_RAW_HARDWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ @@ -2160,20 +2158,17 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) /* * generate control messages if - * - receive time stamping in software requested (SOCK_RCVTSTAMP - * or SOCK_TIMESTAMPING_RX_SOFTWARE) + * - receive time stamping in software requested * - software time stamp available and wanted - * (SOCK_TIMESTAMPING_SOFTWARE) * - hardware time stamps available and wanted - * SOCK_TIMESTAMPING_RAW_HARDWARE */ if (sock_flag(sk, SOCK_RCVTSTAMP) || - sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE) || + (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt.tv64 && - (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE || skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP)) || (hwtstamps->hwtstamp.tv64 && - sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))) + (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sk->sk_stamp = kt; @@ -2189,11 +2184,11 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ - (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_TIMESTAMPING_SOFTWARE) | \ - (1UL << SOCK_TIMESTAMPING_RAW_HARDWARE)) + (1UL << SOCK_RCVTSTAMP)) +#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_TS_OR_DROPS) + if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else sk->sk_stamp = skb->tstamp; @@ -2203,8 +2198,6 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, * sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @tx_flags: filled with instructions for time stamping - * - * Currently only depends on SOCK_TIMESTAMPING* flags. */ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); diff --git a/net/core/sock.c b/net/core/sock.c index a741163568fa..47c9377e14b9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,22 +848,13 @@ set_rcvbuf: ret = -EINVAL; break; } - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, - val & SOF_TIMESTAMPING_TX_HARDWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, - val & SOF_TIMESTAMPING_TX_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, - val & SOF_TIMESTAMPING_RX_HARDWARE); + sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, - val & SOF_TIMESTAMPING_SOFTWARE); - sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, - val & SOF_TIMESTAMPING_RAW_HARDWARE); break; case SO_RCVLOWAT: @@ -1089,19 +1080,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_TIMESTAMPING: - v.val = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_TX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RX_HARDWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) - v.val |= SOF_TIMESTAMPING_SOFTWARE; - if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) - v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + v.val = sk->sk_tsflags; break; case SO_RCVTIMEO: diff --git a/net/socket.c b/net/socket.c index dc0cc5d95ee5..255d9b802723 100644 --- a/net/socket.c +++ b/net/socket.c @@ -613,9 +613,9 @@ EXPORT_SYMBOL(sock_release); void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) { *tx_flags = 0; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) *tx_flags |= SKBTX_HW_TSTAMP; - if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) *tx_flags |= SKBTX_SW_TSTAMP; if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; @@ -723,12 +723,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } memset(&tss, 0, sizeof(tss)); - if ((sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE) || + if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE || skb_shinfo(skb)->tx_flags & SKBTX_ANY_SW_TSTAMP) && ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && - sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE) && + (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) empty = 0; if (!empty) -- cgit v1.2.3 From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:47 -0400 Subject: net-timestamp: add key to disambiguate concurrent datagrams Datagrams timestamped on transmission can coexist in the kernel stack and be reordered in packet scheduling. When reading looped datagrams from the socket error queue it is not always possible to unique correlate looped data with original send() call (for application level retransmits). Even if possible, it may be expensive and complex, requiring packet inspection. Introduce a data-independent ID mechanism to associate timestamps with send calls. Pass an ID alongside the timestamp in field ee_data of sock_extended_err. The ID is a simple 32 bit unsigned int that is associated with the socket and incremented on each send() call for which software tx timestamp generation is enabled. The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to avoid changing ee_data for existing applications that expect it 0. The counter is reset each time the flag is reenabled. Reenabling does not change the ID of already submitted data. It is possible to receive out of order IDs if the timestamp stream is not quiesced first. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/sock.h | 2 ++ include/uapi/linux/net_tstamp.h | 8 +++++--- net/core/skbuff.c | 2 ++ net/core/sock.c | 3 +++ net/ipv4/ip_output.c | 6 ++++++ net/ipv6/ip6_output.c | 9 ++++++++- 7 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 477f0f60db45..0e35b3af7317 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -278,6 +278,7 @@ struct skb_shared_info { unsigned short gso_type; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + u32 tskey; __be32 ip6_frag_id; /* diff --git a/include/net/sock.h b/include/net/sock.h index a21129716aae..52fe0bc5598a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -280,6 +280,7 @@ struct cg_proto; * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_tsflags: SO_TIMESTAMPING socket options + * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -414,6 +415,7 @@ struct sock { struct timer_list sk_timer; ktime_t sk_stamp; u16 sk_tsflags; + u32 sk_tskey; struct socket *sk_socket; void *sk_user_data; struct page_frag sk_frag; diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index f53879c0f590..1e861d2e1a31 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -20,9 +20,11 @@ enum { SOF_TIMESTAMPING_SOFTWARE = (1<<4), SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), - SOF_TIMESTAMPING_MASK = - (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | - SOF_TIMESTAMPING_RAW_HARDWARE + SOF_TIMESTAMPING_OPT_ID = (1<<7), + + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID, + SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | + SOF_TIMESTAMPING_LAST }; /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c9f68802e992..0df4f1d14c5a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,6 +3522,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SND; + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = skb_shinfo(skb)->tskey; err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 47c9377e14b9..1e0f1c63ad6b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -848,6 +848,9 @@ set_rcvbuf: ret = -EINVAL; break; } + if (val & SOF_TIMESTAMPING_OPT_ID && + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) + sk->sk_tskey = 0; sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b16556836d66..215af2b155cb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->fragsize; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -976,6 +980,8 @@ alloc_new_skb: /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f5dafe609f8b..315a55d66079 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1157,6 +1157,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int err; int offset = 0; __u8 tx_flags = 0; + u32 tskey = 0; if (flags&MSG_PROBE) return 0; @@ -1272,8 +1273,12 @@ emsgsize: } } - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) + if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { sock_tx_timestamp(sk, &tx_flags); + if (tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + } /* * Let's try using as much space as possible. @@ -1397,6 +1402,8 @@ alloc_new_skb: /* Only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = tx_flags; tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; /* * Find where to start putting bytes -- cgit v1.2.3 From e7fd2885385157d46c85f282fc6d7d297db43e1f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:48 -0400 Subject: net-timestamp: SCHED timestamp on entering packet scheduler Kernel transmit latency is often incurred in the packet scheduler. Introduce a new timestamp on transmission just before entering the scheduler. When data travels through multiple devices (bonding, tunneling, ...) each device will export an individual timestamp. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- include/uapi/linux/errqueue.h | 1 + include/uapi/linux/net_tstamp.h | 3 ++- net/core/dev.c | 4 ++++ net/core/skbuff.c | 16 ++++++++++++---- net/socket.c | 3 +++ 6 files changed, 31 insertions(+), 7 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0e35b3af7317..50e1e9b3a5a5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -229,7 +229,7 @@ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, - /* generate software time stamp */ + /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ @@ -247,9 +247,12 @@ enum { * all frags to avoid possible bad checksum */ SKBTX_SHARED_FRAG = 1 << 5, + + /* generate software time stamp when entering packet scheduling */ + SKBTX_SCHED_TSTAMP = 1 << 6, }; -#define SKBTX_ANY_SW_TSTAMP SKBTX_SW_TSTAMP +#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) /* @@ -2695,6 +2698,10 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype); + /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index accee72cae7c..17437cf297b7 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -39,6 +39,7 @@ struct scm_timestamping { */ enum { SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ + SCM_TSTAMP_SCHED, /* data entered the packet scheduler */ }; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 1e861d2e1a31..60733845fcdd 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -21,8 +21,9 @@ enum { SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), SOF_TIMESTAMPING_OPT_ID = (1<<7), + SOF_TIMESTAMPING_TX_SCHED = (1<<8), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_SCHED, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/dev.c b/net/core/dev.c index b370230fe1d3..1c15b189c52b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -132,6 +132,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2876,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb_reset_mac_header(skb); + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0df4f1d14c5a..9705c0732aab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sock_queue_err_skb); -void skb_tstamp_tx(struct sk_buff *orig_skb, - struct skb_shared_hwtstamps *hwtstamps) +void __skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, int tstype) { - struct sock *sk = orig_skb->sk; struct sock_exterr_skb *serr; struct sk_buff *skb; int err; @@ -3521,7 +3521,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; - serr->ee.ee_info = SCM_TSTAMP_SND; + serr->ee.ee_info = tstype; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = skb_shinfo(skb)->tskey; @@ -3530,6 +3530,14 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, if (err) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(__skb_tstamp_tx); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk, + SCM_TSTAMP_SND); +} EXPORT_SYMBOL_GPL(skb_tstamp_tx); void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) diff --git a/net/socket.c b/net/socket.c index 255d9b802723..3a2778d71631 100644 --- a/net/socket.c +++ b/net/socket.c @@ -617,6 +617,9 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) *tx_flags |= SKBTX_HW_TSTAMP; if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) *tx_flags |= SKBTX_SW_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + *tx_flags |= SKBTX_SCHED_TSTAMP; + if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; } -- cgit v1.2.3 From 4ed2d765dfaccff5ebdac68e2064b59125033a3b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:49 -0400 Subject: net-timestamp: TCP timestamping TCP timestamping extends SO_TIMESTAMPING to bytestreams. Bytestreams do not have a 1:1 relationship between send() buffers and network packets. The feature interprets a send call on a bytestream as a request for a timestamp for the last byte in that send() buffer. The choice corresponds to a request for a timestamp when all bytes in the buffer have been sent. That assumption depends on in-order kernel transmission. This is the common case. That said, it is possible to construct a traffic shaping tree that would result in reordering. The guarantee is strong, then, but not ironclad. This implementation supports send and sendpages (splice). GSO replaces one large packet with multiple smaller packets. This patch also copies the option into the correct smaller packet. This patch does not yet support timestamping on data in an initial TCP Fast Open SYN, because that takes a very different data path. If ID generation in ee_data is enabled, bytestream timestamps return a byte offset, instead of the packet counter for datagrams. The implementation supports a single timestamp per packet. It silenty replaces requests for previous timestamps. To avoid missing tstamps, flush the tcp queue by disabling Nagle, cork and autocork. Missing tstamps can be detected by offset when the ee_data ID is enabled. Implementation details: - On GSO, the timestamping code can be included in the main loop. I moved it into its own loop to reduce the impact on the common case to a single branch. - To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. The first is always set (also for GSO & ACK). The second must also never be uninitialized. Only allow the ID option on sockets in the ESTABLISHED state, for which the seqno is available. Never reset it to zero (instead, move it to the current seqno when reenabling the option). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 ++++- net/core/sock.c | 13 +++++++++++-- net/ipv4/tcp.c | 22 +++++++++++++++++++--- net/ipv4/tcp_offload.c | 18 ++++++++++++++++++ 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9705c0732aab..3dec0293a7c5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3522,8 +3522,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk->sk_protocol == IPPROTO_TCP) + serr->ee.ee_data -= sk->sk_tskey; + } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 1e0f1c63ad6b..2714811afbd8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -849,8 +849,17 @@ set_rcvbuf: break; } if (val & SOF_TIMESTAMPING_OPT_ID && - !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) - sk->sk_tskey = 0; + !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_state != TCP_ESTABLISHED) { + ret = -EINVAL; + break; + } + sk->sk_tskey = tcp_sk(sk)->snd_una; + } else { + sk->sk_tskey = 0; + } + } sk->sk_tsflags = val; if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d2118e5fbc7..744af67a5989 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); +void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + sock_tx_timestamp(sk, &shinfo->tx_flags); + if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP) + shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; +} + /* * Wait for a TCP event. * @@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); - if (sk->sk_err) + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; return mask; @@ -959,8 +968,10 @@ new_segment: copied += copy; offset += copy; - if (!(size -= copy)) + if (!(size -= copy)) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -1252,8 +1263,10 @@ new_segment: from += copy; copied += copy; - if ((seglen -= copy) == 0 && iovlen == 0) + if ((seglen -= copy) == 0 && iovlen == 0) { + tcp_tx_timestamp(sk, skb); goto out; + } if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) continue; @@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; + if (unlikely(flags & MSG_ERRQUEUE)) + return ip_recv_error(sk, msg, len, addr_len); + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) sk_busy_loop(sk, nonblock); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 55046ecd083e..f597119fc4e7 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -14,6 +14,21 @@ #include #include +void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq, + unsigned int mss) +{ + while (skb) { + if (ts_seq < (__u64) seq + mss) { + skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tskey = ts_seq; + return; + } + + skb = skb->next; + seq += mss; + } +} + struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { @@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) + tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); -- cgit v1.2.3 From e1c8a607b28190cd09a271508aa3025d3c2f312e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 Aug 2014 22:11:50 -0400 Subject: net-timestamp: ACK timestamp for bytestreams Add SOF_TIMESTAMPING_TX_ACK, a request for a tstamp when the last byte in the send() call is acknowledged. It implements the feature for TCP. The timestamp is generated when the TCP socket cumulative ACK is moved beyond the tracked seqno for the first time. The feature ignores SACK and FACK, because those acknowledge the specific byte, but not necessarily the entire contents of the buffer up to that byte. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ++++++- include/uapi/linux/errqueue.h | 1 + include/uapi/linux/net_tstamp.h | 3 ++- net/ipv4/tcp_input.c | 6 ++++++ net/socket.c | 2 ++ 5 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50e1e9b3a5a5..11c270551d25 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -250,9 +250,14 @@ enum { /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, + + /* generate software timestamp on peer data acknowledgment */ + SKBTX_ACK_TSTAMP = 1 << 7, }; -#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | SKBTX_SCHED_TSTAMP) +#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ + SKBTX_SCHED_TSTAMP | \ + SKBTX_ACK_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) /* diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 17437cf297b7..07bdce1f444a 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -40,6 +40,7 @@ struct scm_timestamping { enum { SCM_TSTAMP_SND, /* driver passed skb to NIC, or HW */ SCM_TSTAMP_SCHED, /* data entered the packet scheduler */ + SCM_TSTAMP_ACK, /* data acknowledged by peer */ }; #endif /* _UAPI_LINUX_ERRQUEUE_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 60733845fcdd..ff354021bb69 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -22,8 +22,9 @@ enum { SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), SOF_TIMESTAMPING_OPT_ID = (1<<7), SOF_TIMESTAMPING_TX_SCHED = (1<<8), + SOF_TIMESTAMPING_TX_ACK = (1<<9), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_SCHED, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6a2984507755..a3d47af01906 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -74,6 +74,7 @@ #include #include #include +#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -3106,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_stamp = 0; } + if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) && + between(skb_shinfo(skb)->tskey, prior_snd_una, + tp->snd_una + 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + if (!fully_acked) break; diff --git a/net/socket.c b/net/socket.c index 3a2778d71631..ae89569a2db5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -619,6 +619,8 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) *tx_flags |= SKBTX_SW_TSTAMP; if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) *tx_flags |= SKBTX_SCHED_TSTAMP; + if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) + *tx_flags |= SKBTX_ACK_TSTAMP; if (sock_flag(sk, SOCK_WIFI_STATUS)) *tx_flags |= SKBTX_WIFI_STATUS; -- cgit v1.2.3