From 2a916ecc405686c1d86f632281bc06aa75ebae4e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 19 Jun 2020 03:32:48 +0000 Subject: net/devlink: Support querying hardware address of port function PCI PF and VF devlink port can manage the function represented by a devlink port. Enable users to query port function's hardware address. Example of a PCI VF port which supports a port function: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:11:22:33:44:66 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "enp6s0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "function": { "hw_addr": "00:11:22:33:44:66" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 08563e6a424d..07d0af8f5923 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -451,6 +451,8 @@ enum devlink_attr { DEVLINK_ATTR_TRAP_POLICER_RATE, /* u64 */ DEVLINK_ATTR_TRAP_POLICER_BURST, /* u64 */ + DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, @@ -497,4 +499,12 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_function_attr { + DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, + DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + + __DEVLINK_PORT_FUNCTION_ATTR_MAX, + DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From b5872cd0e823e4cb50b3a75cd9522167eeb676a2 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sat, 20 Jun 2020 22:01:56 +0530 Subject: devlink: Add support for board.serial_number to info_get cb. 
Board serial number is a serial number, often available in PCI *Vital Product Data*. Also, update devlink-info.rst documentation file. Cc: Jiri Pirko Cc: Jakub Kicinski Signed-off-by: Vasundhara Volam Reviewed-by: Michael Chan Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-info.rst | 12 +++++------- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 8 ++++++++ 4 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 3fe11401b838..7572bf6de5c1 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -44,9 +44,11 @@ versions is generally discouraged - here, and via any other Linux API. reported for two ports of the same device or on two hosts of a multi-host device should be identical. - .. note:: ``devlink-info`` API should be extended with a new field - if devices want to report board/product serial number (often - reported in PCI *Vital Product Data* capability). + * - ``board.serial_number`` + - Board serial number of the device. + + This is usually the serial number of the board, often available in + PCI *Vital Product Data*. * - ``fixed`` - Group for hardware identifiers, and versions of components @@ -201,10 +203,6 @@ Future work The following extensions could be useful: - - product serial number - NIC boards often get labeled with a board serial - number rather than ASIC serial number; it'd be useful to add board serial - numbers to the API if they can be retrieved from the device; - - on-disk firmware file names - drivers list the file names of firmware they may need to load onto devices via the ``MODULE_FIRMWARE()`` macro. These, however, are per module, rather than per device. 
It'd be useful to list diff --git a/include/net/devlink.h b/include/net/devlink.h index 7007f93585a5..428f55f8197c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1284,6 +1284,8 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn); int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 07d0af8f5923..87c83a82991b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -453,6 +453,8 @@ enum devlink_attr { DEVLINK_ATTR_PORT_FUNCTION, /* nested */ + DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index baa45eca6b5a..455998a57671 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4502,6 +4502,14 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +int devlink_info_board_serial_number_put(struct devlink_info_req *req, + const char *bsn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, + bsn); +} +EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); + static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, const char *version_value) -- cgit v1.2.3 From 79a28ddd18e9c653f13f60dfabee15c024e64b9b Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Tue, 23 Jun 2020 10:33:45 +0200 Subject: rtnetlink: add keepalived rtm_protocol Keepalived can set global static ip routes or virtual ip routes dynamically following VRRP protocol states. 
Using a dedicated rtm_protocol will help keeping track of it. Changes in v2: - fix tab/space indenting Signed-off-by: Alexandre Cassen Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 073e71ef6bdd..879e64950a0a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -257,12 +257,12 @@ enum { /* rtm_protocol */ -#define RTPROT_UNSPEC 0 -#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; - not used by current IPv4 */ -#define RTPROT_KERNEL 2 /* Route installed by kernel */ -#define RTPROT_BOOT 3 /* Route installed during boot */ -#define RTPROT_STATIC 4 /* Route installed by administrator */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ /* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; they are just passed from user and back as is. @@ -271,22 +271,23 @@ enum { avoid conflicts. 
*/ -#define RTPROT_GATED 8 /* Apparently, GateD */ -#define RTPROT_RA 9 /* RDISC/ND router advertisements */ -#define RTPROT_MRT 10 /* Merit MRT */ -#define RTPROT_ZEBRA 11 /* Zebra */ -#define RTPROT_BIRD 12 /* BIRD */ -#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ -#define RTPROT_XORP 14 /* XORP */ -#define RTPROT_NTK 15 /* Netsukuku */ -#define RTPROT_DHCP 16 /* DHCP client */ -#define RTPROT_MROUTED 17 /* Multicast daemon */ -#define RTPROT_BABEL 42 /* Babel daemon */ -#define RTPROT_BGP 186 /* BGP Routes */ -#define RTPROT_ISIS 187 /* ISIS Routes */ -#define RTPROT_OSPF 188 /* OSPF Routes */ -#define RTPROT_RIP 189 /* RIP Routes */ -#define RTPROT_EIGRP 192 /* EIGRP Routes */ +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ /* rtm_scope -- cgit v1.2.3 From bdb7b79b4ce864a724250e1d35948c46f135de36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 22 Jun 2020 20:22:21 -0700 Subject: bpf: Switch most helper return values from 32-bit int to 64-bit long Switch most of BPF helper definitions from returning int to long. These definitions are coming from comments in BPF UAPI header and are used to generate bpf_helper_defs.h (under libbpf) to be later included and used from BPF programs. 
In actual in-kernel implementation, all the helpers are defined as returning u64, but due to some historical reasons, most of them are actually defined as returning int in UAPI (usually, to return 0 on success, and negative value on error). This actually causes Clang to quite often generate sub-optimal code, because compiler believes that return value is 32-bit, and in a lot of cases has to be up-converted (usually with a pair of 32-bit bit shifts) to 64-bit values, before they can be used further in BPF code. Besides just "polluting" the code, these 32-bit shifts quite often cause problems for cases in which return value matters. This is especially the case for the family of bpf_probe_read_str() functions. There are a few other similar helpers (e.g., bpf_read_branch_records()), in which return value is used by BPF program logic to record variable-length data and process it. For such cases, BPF program logic carefully manages offsets within some array or map to read variable-length data. For such uses, it's crucial for BPF verifier to track possible range of register values to prove that all the accesses happen within given memory bounds. Those extraneous zero-extending bit shifts, inserted by Clang (and quite often interleaved with other code, which makes the issues even more challenging and sometimes requires employing extra per-variable compiler barriers), throw off verifier logic and make it mark registers as having unknown variable offset. We'll study this pattern a bit later below. Another common pattern is to check return of BPF helper for non-zero state to detect error conditions and attempt alternative actions in such case. Even in this simple and straightforward case, this 32-bit vs BPF's native 64-bit mode quite often leads to sub-optimal and unnecessary extra code. We'll look at this pattern as well. 
Clang's BPF target supports two modes of code generation: ALU32, in which it is capable of using lower 32-bit parts of registers, and no-ALU32, in which only full 64-bit registers are being used. ALU32 mode somewhat mitigates the above described problems, but not in all cases. This patch switches all the cases in which BPF helpers return 0 or negative error from returning int to returning long. It is shown below that such change in definition leads to equivalent or better code. No-ALU32 mode benefits more, but ALU32 mode doesn't degrade or still gets improved code generation. Another class of cases switched from int to long are bpf_probe_read_str()-like helpers, which encode successful case as non-negative values, while still returning negative value for errors. In all of such cases, correctness is preserved due to two's complement encoding of negative values and the fact that all helpers return values with 32-bit absolute value. Two's complement ensures that for negative values higher 32 bits are all ones and when truncated, leave valid negative 32-bit value with the same value. Non-negative values have upper 32 bits set to zero and similarly preserve value when high 32 bits are truncated. This means that just casting to int/u32 is correct and efficient (and in ALU32 mode doesn't require any extra shifts). To minimize the chances of regressions, two code patterns were investigated, as mentioned above. For both patterns, BPF assembly was analyzed in ALU32/NO-ALU32 compiler modes, both with current 32-bit int return type and new 64-bit long return type. Case 1. Variable-length data reading and concatenation. This is quite ubiquitous pattern in tracing/monitoring applications, reading data like process's environment variables, file path, etc. In such case, many pieces of string-like variable-length data are read into a single big buffer, and at the end of the process, only a part of array containing actual data is sent to user-space for further processing. 
This case is tested in test_varlen.c selftest (in the next patch). Code flow is roughly as follows: void *payload = &sample->payload; u64 len; len = bpf_probe_read_kernel_str(payload, MAX_SZ1, &source_data1); if (len <= MAX_SZ1) { payload += len; sample->len1 = len; } len = bpf_probe_read_kernel_str(payload, MAX_SZ2, &source_data2); if (len <= MAX_SZ2) { payload += len; sample->len2 = len; } /* and so on */ sample->total_len = payload - &sample->payload; /* send over, e.g., perf buffer */ There could be two variations with slightly different code generated: when len is 64-bit integer and when it is 32-bit integer. Both variations were analysed. BPF assembly instructions between two successive invocations of bpf_probe_read_kernel_str() were used to check code regressions. Results are below, followed by short analysis. Left side is using helpers with int return type, the right one is after the switch to long. ALU32 + INT ALU32 + LONG =========== ============ 64-BIT (13 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +9 18: if r0 > 256 goto +6 19: w1 = w0 19: r1 = 0 ll 20: r1 <<= 32 21: *(u64 *)(r1 + 0) = r0 21: r1 s>>= 32 22: r6 = 0 ll 22: r2 = 0 ll 24: r6 += r0 24: *(u64 *)(r2 + 0) = r1 00000000000000c8 : 25: r6 = 0 ll 25: r1 = r6 27: r6 += r1 26: w2 = 256 00000000000000e0 : 27: r3 = 0 ll 28: r1 = r6 29: call 115 29: w2 = 256 30: r3 = 0 ll 32: call 115 32-BIT (11 insns): 32-BIT (12 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: if w0 > 256 goto +7 18: if w0 > 256 goto +8 19: r1 = 0 ll 19: r1 = 0 ll 21: *(u32 *)(r1 + 0) = r0 21: *(u32 *)(r1 + 0) = r0 22: w1 = w0 22: r0 <<= 32 23: r6 = 0 ll 23: r0 >>= 32 25: r6 += r1 24: r6 = 0 ll 00000000000000d0 : 26: r6 += r0 26: r1 = r6 00000000000000d8 : 27: w2 = 256 27: r1 = r6 28: r3 = 0 ll 28: w2 = 256 30: call 115 29: r3 = 0 ll 31: call 115 In ALU32 mode, the 
variant using 64-bit length variable clearly wins and avoids unnecessary zero-extension bit shifts. In practice, this is even more important and good, because BPF code won't need to do extra checks to "prove" that payload/len are within good bounds. 32-bit len is one instruction longer. Clang decided to do 64-to-32 casting with two bit shifts, instead of equivalent `w1 = w0` assignment. The former uses extra register. The latter might potentially lose some range information, but not for 32-bit value. So in this case, verifier infers that r0 is [0, 256] after check at 18:, and shifting 32 bits left/right keeps that range intact. We should probably look into Clang's logic and see why it chooses bitshifts over sub-register assignments for this. NO-ALU32 + INT NO-ALU32 + LONG ============== =============== 64-BIT (14 insns): 64-BIT (10 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r0 <<= 32 18: if r0 > 256 goto +6 19: r1 = r0 19: r1 = 0 ll 20: r1 >>= 32 21: *(u64 *)(r1 + 0) = r0 21: if r1 > 256 goto +7 22: r6 = 0 ll 22: r0 s>>= 32 24: r6 += r0 23: r1 = 0 ll 00000000000000c8 : 25: *(u64 *)(r1 + 0) = r0 25: r1 = r6 26: r6 = 0 ll 26: r2 = 256 28: r6 += r0 27: r3 = 0 ll 00000000000000e8 : 29: call 115 29: r1 = r6 30: r2 = 256 31: r3 = 0 ll 33: call 115 32-BIT (13 insns): 32-BIT (13 insns): ------------------------------------ ------------------------------------ 17: call 115 17: call 115 18: r1 = r0 18: r1 = r0 19: r1 <<= 32 19: r1 <<= 32 20: r1 >>= 32 20: r1 >>= 32 21: if r1 > 256 goto +6 21: if r1 > 256 goto +6 22: r2 = 0 ll 22: r2 = 0 ll 24: *(u32 *)(r2 + 0) = r0 24: *(u32 *)(r2 + 0) = r0 25: r6 = 0 ll 25: r6 = 0 ll 27: r6 += r1 27: r6 += r1 00000000000000e0 : 00000000000000e0 : 28: r1 = r6 28: r1 = r6 29: r2 = 256 29: r2 = 256 30: r3 = 0 ll 30: r3 = 0 ll 32: call 115 32: call 115 In NO-ALU32 mode, for the case of 64-bit len variable, Clang generates much superior code, as expected, eliminating 
unnecessary bit shifts. For 32-bit len, code is identical. So overall, only ALU-32 32-bit len case is more-or-less equivalent and the difference stems from internal Clang decision, rather than compiler lacking enough information about types. Case 2. Let's look at the simpler case of checking return result of BPF helper for errors. The code is very simple: long bla; if (bpf_probe_read_kernel(&bla, sizeof(bla), 0)) return 1; else return 0; ALU32 + CHECK (9 insns) ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: w2 = 8 2: w2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: w1 = w0 5: r1 = r0 6: w0 = 1 6: w0 = 1 7: if w1 != 0 goto +1 7: if r1 != 0 goto +1 8: w0 = 0 8: w0 = 0 0000000000000048 : 0000000000000048 : 9: exit 9: exit Almost identical code, the only difference is the use of full register assignment (r1 = r0) vs half-registers (w1 = w0) in instruction #5. On 32-bit architectures, new BPF assembly might be slightly less optimal, in theory. But one can argue that's not a big issue, given that use of full registers is still prevalent (e.g., for parameter passing). NO-ALU32 + CHECK (11 insns) NO-ALU32 + CHECK (9 insns) ==================================== ==================================== 0: r1 = r10 0: r1 = r10 1: r1 += -8 1: r1 += -8 2: r2 = 8 2: r2 = 8 3: r3 = 0 3: r3 = 0 4: call 113 4: call 113 5: r1 = r0 5: r1 = r0 6: r1 <<= 32 6: r0 = 1 7: r1 >>= 32 7: if r1 != 0 goto +1 8: r0 = 1 8: r0 = 0 9: if r1 != 0 goto +1 0000000000000048 : 10: r0 = 0 9: exit 0000000000000058 : 11: exit NO-ALU32 is a clear improvement, getting rid of unnecessary zero-extension bit shifts. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200623032224.4020118-1-andriin@fb.com --- include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- tools/include/uapi/linux/bpf.h | 192 ++++++++++++++++++++--------------------- 2 files changed, 192 insertions(+), 192 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. 
It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. * @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. 
To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run is the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. 
Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. 
* Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. 
The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure.
* - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. 
@@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. 
@@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. 
* - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. 
* - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. 
* - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exist for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*.
* - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 19684813faae..be0efee49093 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -653,7 +653,7 @@ union bpf_attr { * Map value associated to *key*, or **NULL** if no entry was * found. * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) * Description * Add or update the value of the entry associated to *key* in * *map* with *value*. *flags* is one of: @@ -671,13 +671,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) * Description * Delete entry with *key* from *map*. 
* Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from * kernel space address *unsafe_ptr* and store the data in *dst*. @@ -695,7 +695,7 @@ union bpf_attr { * Return * Current *ktime*. * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) * Description * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) @@ -775,7 +775,7 @@ union bpf_attr { * Return * The SMP id of the processor running the program. * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. *flags* are a combination of @@ -792,7 +792,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) * Description * Recompute the layer 3 (e.g. IP) checksum for the packet * associated to *skb*. Computation is incremental, so the helper @@ -817,7 +817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) * Description * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the * packet associated to *skb*. 
Computation is incremental, so the @@ -849,7 +849,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) * Description * This special helper is used to trigger a "tail call", or in * other words, to jump into another eBPF program. The same stack @@ -880,7 +880,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) * Description * Clone and redirect the packet associated to *skb* to another * net device of index *ifindex*. Both ingress and egress @@ -916,7 +916,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(void *buf, u32 size_of_buf) + * long bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -953,7 +953,7 @@ union bpf_attr { * Return * The classid, or 0 for the default unconfigured classid. * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) * Description * Push a *vlan_tci* (VLAN tag control information) of protocol * *vlan_proto* to the packet associated to *skb*, then update @@ -969,7 +969,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_vlan_pop(struct sk_buff *skb) + * long bpf_skb_vlan_pop(struct sk_buff *skb) * Description * Pop a VLAN header from the packet associated to *skb*. 
* @@ -981,7 +981,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Get tunnel metadata. This helper takes a pointer *key* to an * empty **struct bpf_tunnel_key** of **size**, that will be @@ -1032,7 +1032,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) * Description * Populate tunnel metadata for packet associated to *skb.* The * tunnel metadata is set to the contents of *key*, of *size*. The @@ -1098,7 +1098,7 @@ union bpf_attr { * The value of the perf event counter read from the map, or a * negative error code in case of failure. * - * int bpf_redirect(u32 ifindex, u64 flags) + * long bpf_redirect(u32 ifindex, u64 flags) * Description * Redirect the packet to another net device of index *ifindex*. * This helper is somewhat similar to **bpf_clone_redirect**\ @@ -1145,7 +1145,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1190,7 +1190,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1207,7 +1207,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1276,7 +1276,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1294,7 +1294,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1304,7 +1304,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) * Description * Change the protocol of the *skb* to *proto*. Currently * supported are transition from IPv4 to IPv6, and from IPv6 to @@ -1331,7 +1331,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) * Description * Change the packet type for the packet associated to *skb*. This * comes down to setting *skb*\ **->pkt_type** to *type*, except @@ -1358,7 +1358,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) * Description * Check whether *skb* is a descendant of the cgroup2 held by * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. @@ -1389,7 +1389,7 @@ union bpf_attr { * Return * A pointer to the current task struct. * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) + * long bpf_probe_write_user(void *dst, const void *src, u32 len) * Description * Attempt in a safe way to write *len* bytes from the buffer * *src* to *dst* in memory. It only works for threads that are in @@ -1408,7 +1408,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) * Description * Check whether the probe is being run in the context of a given * subset of the cgroup2 hierarchy. The cgroup2 to test is held by @@ -1420,7 +1420,7 @@ union bpf_attr { * * 1, if the *skb* task does not belong to the cgroup2. * * A negative error code, if an error occurred. * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) * Description * Resize (trim or grow) the packet associated to *skb* to the * new *len*. The *flags* are reserved for future usage, and must @@ -1444,7 +1444,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure.
* - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) * Description * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes @@ -1500,7 +1500,7 @@ union bpf_attr { * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. * - * int bpf_get_numa_node_id(void) + * long bpf_get_numa_node_id(void) * Description * Return the id of the current NUMA node. The primary use case * for this helper is the selection of sockets for the local NUMA @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * The id of current NUMA node. * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) * Description * Grows headroom of packet associated to *skb* and adjusts the * offset of the MAC header accordingly, adding *len* bytes of @@ -1532,7 +1532,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that * it is possible to use a negative value for *delta*. This helper @@ -1547,7 +1547,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for @@ -1595,14 +1595,14 @@ union bpf_attr { * is returned (note that **overflowuid** might also be the actual * UID value for the socket). 
* - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) + * long bpf_set_hash(struct sk_buff *skb, u32 hash) * Description * Set the full hash for *skb* (set the field *skb*\ **->hash**) * to value *hash*. * Return * 0 * - * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1630,7 +1630,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. @@ -1676,7 +1676,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain @@ -1697,7 +1697,7 @@ union bpf_attr { * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. * - * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1708,7 +1708,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. 
* - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a *map* referencing sockets. The * *skops* is used as a new value for the entry associated to @@ -1727,7 +1727,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) * Description * Adjust the address pointed by *xdp_md*\ **->data_meta** by * *delta* (which can be positive or negative). Note that this @@ -1756,7 +1756,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) * Description * Read the value of a perf event counter, and store it into *buf* * of size *buf_size*. This helper relies on a *map* of type @@ -1806,7 +1806,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) * Description * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in @@ -1817,7 +1817,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure.
* - * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1842,7 +1842,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_override_return(struct pt_regs *regs, u64 rc) + * long bpf_override_return(struct pt_regs *regs, u64 rc) * Description * Used for error injection, this helper uses kprobes to override * the return value of the probed function, and to set it to *rc*. @@ -1867,7 +1867,7 @@ union bpf_attr { * Return * 0 * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) * Description * Attempt to set the value of the **bpf_sock_ops_cb_flags** field * for the full TCP socket associated to *bpf_sock_ops* to @@ -1911,7 +1911,7 @@ union bpf_attr { * be set is returned (which comes down to 0 if all bits were set * as required). * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -1925,7 +1925,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, apply the verdict of the eBPF program to * the next *bytes* (number of bytes) of message *msg*. 
@@ -1959,7 +1959,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) * Description * For socket policies, prevent the execution of the verdict eBPF * program for message *msg* until *bytes* (byte number) have been @@ -1977,7 +1977,7 @@ union bpf_attr { * Return * 0 * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) * Description * For socket policies, pull in non-linear data from user space * for *msg* and set pointers *msg*\ **->data** and *msg*\ @@ -2008,7 +2008,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) * Description * Bind the socket associated to *ctx* to the address pointed by * *addr*, of length *addr_len*. This allows for making outgoing @@ -2026,7 +2026,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is * possible to both shrink and grow the packet tail. @@ -2040,7 +2040,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) * Description * Retrieve the XFRM state (IP transform framework, see also * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. 
@@ -2056,7 +2056,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -2089,7 +2089,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2111,7 +2111,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) * Description * Do FIB lookup in kernel tables using parameters in *params*. * If lookup is successful and result shows packet is to be @@ -2142,7 +2142,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2161,7 +2161,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * socket level. If the message *msg* is allowed to pass (i.e. if @@ -2175,7 +2175,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. @@ -2189,7 +2189,7 @@ union bpf_attr { * Return * **SK_PASS** on success, or **SK_DROP** on error. * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) * Description * Encapsulate the packet associated to *skb* within a Layer 3 * protocol header. This header is provided in the buffer at @@ -2226,7 +2226,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) * Description * Store *len* bytes from address *from* into the packet * associated to *skb*, at *offset*. Only the flags, tag and TLVs @@ -2241,7 +2241,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) * Description * Adjust the size allocated to TLVs in the outermost IPv6 * Segment Routing Header contained in the packet associated to @@ -2257,7 +2257,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) * Description * Apply an IPv6 Segment Routing action of type *action* to the * packet associated to *skb*. Each action takes a parameter @@ -2286,7 +2286,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_repeat(void *ctx) + * long bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded repeat key message. This delays @@ -2305,7 +2305,7 @@ union bpf_attr { * Return * 0 * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded key press with *scancode*, @@ -2370,7 +2370,7 @@ union bpf_attr { * Return * A pointer to the local storage area. * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. @@ -2471,7 +2471,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. 
* - * int bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(struct bpf_sock *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -2479,7 +2479,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) * Description * Push an element *value* in *map*. *flags* is one of: * @@ -2489,19 +2489,19 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * long bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * long bpf_map_peek_elem(struct bpf_map *map, void *value) * Description * Get an element from *map* without removing it. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2517,7 +2517,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if @@ -2529,7 +2529,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) * Description * This helper is used in programs implementing IR decoding, to * report a successfully decoded pointer movement. @@ -2543,7 +2543,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_lock(struct bpf_spin_lock *lock) + * long bpf_spin_lock(struct bpf_spin_lock *lock) * Description * Acquire a spinlock represented by the pointer *lock*, which is * stored as part of a value of a map. Taking the lock allows to @@ -2591,7 +2591,7 @@ union bpf_attr { * Return * 0 * - * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * long bpf_spin_unlock(struct bpf_spin_lock *lock) * Description * Release the *lock* previously locked by a call to * **bpf_spin_lock**\ (\ *lock*\ ). @@ -2614,7 +2614,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** @@ -2651,7 +2651,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2666,7 +2666,7 @@ union bpf_attr { * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. 
* - * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description * Get name of sysctl in /proc/sys/ and copy it into provided by * program buffer *buf* of size *buf_len*. @@ -2682,7 +2682,7 @@ union bpf_attr { * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). * - * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get current value of sysctl as it is presented in /proc/sys * (incl. newline, etc), and copy it as a string into provided @@ -2701,7 +2701,7 @@ union bpf_attr { * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. * - * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) * Description * Get new value being written by user space to sysctl (before * the actual write happens) and copy it as a string into @@ -2718,7 +2718,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. * - * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) * Description * Override new value being written by user space to sysctl with * value provided by program in buffer *buf* of size *buf_len*. @@ -2735,7 +2735,7 @@ union bpf_attr { * * **-EINVAL** if sysctl is being read. 
* - * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to a long integer according to the given base @@ -2759,7 +2759,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) * Description * Convert the initial part of the string from buffer *buf* of * size *buf_len* to an unsigned long integer according to the @@ -2810,7 +2810,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return @@ -2818,7 +2818,7 @@ union bpf_attr { * * **-ENOENT** if the bpf-local-storage cannot be found. * - * int bpf_send_signal(u32 sig) + * long bpf_send_signal(u32 sig) * Description * Send signal *sig* to the process of the current task. * The signal may be delivered to any of this process's threads. @@ -2859,7 +2859,7 @@ union bpf_attr { * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 * - * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -2883,21 +2883,21 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from user space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * Description * Safely attempt to read *size* bytes from kernel space address * *unsafe_ptr* and store the data in *dst*. * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe user address * *unsafe_ptr* to *dst*. The *size* should include the @@ -2941,7 +2941,7 @@ union bpf_attr { * including the trailing NUL character. On error, a negative * value. * - * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) * Description * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. @@ -2949,14 +2949,14 @@ union bpf_attr { * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. * - * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) * Description * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. * *rcv_nxt* is the ack_seq to be sent out. * Return * 0 on success, or a negative error in case of failure. 
* - * int bpf_send_signal_thread(u32 sig) + * long bpf_send_signal_thread(u32 sig) * Description * Send signal *sig* to the thread corresponding to the current task. * Return @@ -2976,7 +2976,7 @@ union bpf_attr { * Return * The 64 bit jiffies * - * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) * Description * For an eBPF program attached to a perf event, retrieve the * branch records (**struct perf_branch_entry**) associated to *ctx* @@ -2995,7 +2995,7 @@ union bpf_attr { * * **-ENOENT** if architecture does not support branch records. * - * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) * Description * Returns 0 on success, values for *pid* and *tgid* as seen from the current * *namespace* will be returned in *nsdata*. @@ -3007,7 +3007,7 @@ union bpf_attr { * * **-ENOENT** if pidns does not exist for the current task. * - * int bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -3062,7 +3062,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, @@ -3097,7 +3097,7 @@ union bpf_attr { * Return * Current *ktime*.
* - * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) * Description * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print * out the format string. @@ -3126,7 +3126,7 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * int bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) * Description * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. * The *m* represents the seq_file. The *data* and *len* represent the @@ -3221,7 +3221,7 @@ union bpf_attr { * Return * Requested value, or 0, if flags are not recognized. * - * int bpf_csum_level(struct sk_buff *skb, u64 level) + * long bpf_csum_level(struct sk_buff *skb, u64 level) * Description * Change the skbs checksum level by one layer up or down, or * reset it entirely to none in order to have the stack perform -- cgit v1.2.3 From 428d2459cceb77357b81c242ca22462a6a904817 Mon Sep 17 00:00:00 2001 From: Petr Vaněk Date: Sat, 30 May 2020 14:39:12 +0200 Subject: xfrm: introduce oseq-may-wrap flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 4303 in section 3.3.3 suggests to disable anti-replay for manually distributed ICVs in which case the sender does not need to monitor or reset the counter. However, the sender still increments the counter and when it reaches the maximum value, the counter rolls over back to zero. This patch introduces new extra_flag XFRM_SA_XFLAG_OSEQ_MAY_WRAP which allows sequence number to cycle in outbound packets if set. This flag is used only in legacy and bmp code, because esn should not be negotiated if anti-replay is disabled (see note in 3.3.3 section). 
Signed-off-by: Petr Vaněk Acked-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + net/xfrm/xfrm_replay.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index ff7cfdc6cb44..ffc6a5391bb7 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -387,6 +387,7 @@ struct xfrm_usersa_info { }; #define XFRM_SA_XFLAG_DONT_ENCAP_DSCP 1 +#define XFRM_SA_XFLAG_OSEQ_MAY_WRAP 2 struct xfrm_usersa_id { xfrm_address_t daddr; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 98943f8d01aa..c6a4338a0d08 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -89,7 +89,8 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(x->replay.oseq == 0)) { + if (unlikely(x->replay.oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { x->replay.oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -168,7 +169,8 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, struct sk_buff *skb) if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq; XFRM_SKB_CB(skb)->seq.output.hi = 0; - if (unlikely(replay_esn->oseq == 0)) { + if (unlikely(replay_esn->oseq == 0) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { replay_esn->oseq--; xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; @@ -572,7 +574,8 @@ static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *sk XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < x->replay.oseq)) { + if (unlikely(oseq < x->replay.oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = 
-EOVERFLOW; @@ -611,7 +614,8 @@ static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff XFRM_SKB_CB(skb)->seq.output.hi = 0; xo->seq.hi = 0; - if (unlikely(oseq < replay_esn->oseq)) { + if (unlikely(oseq < replay_esn->oseq) && + !(x->props.extra_flags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP)) { xfrm_audit_state_replay_overflow(x, skb); err = -EOVERFLOW; -- cgit v1.2.3 From f9bcf96837f158db6ea982d15cd2c8161ca6bc23 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 20 Jun 2020 18:30:52 +0300 Subject: bpf: Add SO_KEEPALIVE and related options to bpf_setsockopt This patch adds support of SO_KEEPALIVE flag and TCP related options to bpf_setsockopt() routine. This is helpful if we want to enable or tune TCP keepalive for applications which don't do it in the userspace code. v3: - update kernel-doc in uapi (Nikita Vetoshkin ) v4: - update kernel-doc in tools too (Alexei Starovoitov) - add test to selftests (Alexei Starovoitov) Signed-off-by: Dmitry Yakunin Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200620153052.9439-3-zeil@yandex-team.ru --- include/uapi/linux/bpf.h | 7 +++-- net/core/filter.c | 36 ++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 7 +++-- tools/testing/selftests/bpf/progs/connect4_prog.c | 27 +++++++++++++++++ 4 files changed, 72 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. 
* * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..c713b6b8938f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4289,10 +4289,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { char devname[IFNAMSIZ]; + int val, valbool; struct net *net; int ifindex; int ret = 0; - int val; if (!sk_fullsock(sk)) return -EINVAL; @@ -4303,6 +4303,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); + valbool = val ? 
1 : 0; /* Only some socketops are supported */ switch (optname) { @@ -4361,6 +4362,11 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } ret = sock_bindtoindex(sk, ifindex, false); break; + case SO_KEEPALIVE: + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; default: ret = -EINVAL; } @@ -4421,6 +4427,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) @@ -4449,6 +4456,33 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else tp->save_syn = val; break; + case TCP_KEEPIDLE: + ret = tcp_sock_set_keepidle_locked(sk, val); + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + ret = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + ret = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + ret = -EINVAL; + else + icsk->icsk_syn_retries = val; + break; + case TCP_USER_TIMEOUT: + if (val < 0) + ret = -EINVAL; + else + icsk->icsk_user_timeout = val; + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9d3923e6b860..d9737d51dd19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1621,10 +1621,13 @@ union bpf_attr { * * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. 
* * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index 1ab2c5eba86c..b1b2773c0b9d 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -104,6 +104,30 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_keepalive(struct bpf_sock_addr *ctx) +{ + int zero = 0, one = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one))) + return 1; + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPIDLE, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPINTVL, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_SYNCNT, &one, sizeof(one))) + return 1; + if (bpf_setsockopt(ctx, SOL_TCP, TCP_USER_TIMEOUT, &one, sizeof(one))) + return 1; + } + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &zero, sizeof(zero))) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -121,6 +145,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (bind_to_device(ctx)) return 0; + if (set_keepalive(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From 899426b3bdd947541ba4af8c767575889c8b842a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:16 +0300 Subject: net: neighbor: add fdb 
extended attribute Add an attribute to NDA which will contain all future fdb-specific attributes in order to avoid polluting the NDA namespace with e.g. bridge or vxlan specific attributes. The attribute is called NDA_FDB_EXT_ATTRS and the structure would look like: [NDA_FDB_EXT_ATTRS] = { [NFEA_xxx] } Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 12 ++++++++++++ net/core/neighbour.c | 1 + 2 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index eefcda8ca44e..540ff48402a1 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -30,6 +30,7 @@ enum { NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ NDA_NH_ID, + NDA_FDB_EXT_ATTRS, __NDA_MAX }; @@ -172,4 +173,15 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) +/* embedded into NDA_FDB_EXT_ATTRS: + * [NDA_FDB_EXT_ATTRS] = { + * ... + * } + */ +enum { + NFEA_UNSPEC, + __NFEA_MAX +}; +#define NFEA_MAX (__NFEA_MAX - 1) + #endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ef6b5a8f629c..8e39e28b0a8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1783,6 +1783,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 31cbc39b6344916c20452e43a9171009214c409c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:17 +0300 Subject: net: bridge: add option to allow activity notifications for any fdb entries This patch adds the ability to notify about activity of any entries (static, permanent or ext_learn). EVPN multihoming peers need it to properly and efficiently handle mac sync (peer active/locally active). 
We add a new NFEA_ACTIVITY_NOTIFY attribute which is used to dump the current activity state and to control if static entries should be monitored at all. We use 2 bits - one to activate fdb entry tracking (disabled by default) and the second to denote that an entry is inactive. We need the second bit in order to avoid multiple notifications of inactivity. Obviously this makes no difference for dynamic entries since at the time of inactivity they get deleted, while the tracked non-dynamic entries get the inactive bit set and get a notification. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 11 ++++ net/bridge/br_fdb.c | 117 ++++++++++++++++++++++++++++++++++++----- net/bridge/br_private.h | 4 ++ 3 files changed, 119 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 540ff48402a1..21e569297355 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -173,13 +173,24 @@ enum { }; #define NDTA_MAX (__NDTA_MAX - 1) + /* FDB activity notification bits used in NFEA_ACTIVITY_NOTIFY: + * - FDB_NOTIFY_BIT - notify on activity/expire for any entry + * - FDB_NOTIFY_INACTIVE_BIT - mark as inactive to avoid multiple notifications + */ +enum { + FDB_NOTIFY_BIT = (1 << 0), + FDB_NOTIFY_INACTIVE_BIT = (1 << 1) +}; + /* embedded into NDA_FDB_EXT_ATTRS: * [NDA_FDB_EXT_ATTRS] = { + * [NFEA_ACTIVITY_NOTIFY] * ... 
* } */ enum { NFEA_UNSPEC, + NFEA_ACTIVITY_NOTIFY, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index ed80d9ab0fb9..642deb57c064 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -349,12 +349,21 @@ void br_fdb_cleanup(struct work_struct *work) */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - unsigned long this_timer; + unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || - test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) + test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { + if (test_bit(BR_FDB_NOTIFY, &f->flags)) { + if (time_after(this_timer, now)) + work_delay = min(work_delay, + this_timer - now); + else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, + &f->flags)) + fdb_notify(br, f, RTM_NEWNEIGH, false); + } continue; - this_timer = f->updated + delay; + } + if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { @@ -556,11 +565,17 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return ret; } +/* returns true if the fdb was modified */ +static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) +{ + return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && + test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); +} + void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; - bool fdb_modified = false; /* some users want to always flood. 
*/ if (hold_time(br) == 0) @@ -575,6 +590,12 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->dev->name, addr, vid); } else { unsigned long now = jiffies; + bool fdb_modified = false; + + if (now != fdb->updated) { + fdb->updated = now; + fdb_modified = __fdb_mark_active(fdb); + } /* fastpath: update of existing entry */ if (unlikely(source != fdb->dst && @@ -587,8 +608,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); } - if (now != fdb->updated) - fdb->updated = now; + if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (unlikely(fdb_modified)) { @@ -667,6 +687,23 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, &fdb->key.vlan_id)) goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + nlmsg_end(skb, nlh); return 0; @@ -681,7 +718,9 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)); + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, @@ -791,14 +830,40 @@ errout: return err; } +/* returns true if the fdb is modified */ +static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) +{ + bool modified = false; + + /* allow to mark an entry as 
inactive, usually done on creation */ + if ((notify & FDB_NOTIFY_INACTIVE_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + modified = true; + + if ((notify & FDB_NOTIFY_BIT) && + !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* enabled activity tracking */ + modified = true; + } else if (!(notify & FDB_NOTIFY_BIT) && + test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { + /* disabled activity tracking, clear notify state */ + clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); + modified = true; + } + + return modified; +} + /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, - const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid) + const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, + struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; + u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && @@ -815,6 +880,13 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; + if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { + notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); + if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || + (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) + return -EINVAL; + } + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -858,6 +930,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (fdb_handle_notify(fdb, notify)) + modified = true; + set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); fdb->used = jiffies; @@ -871,7 +946,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge 
*br, struct net_bridge_port *p, const unsigned char *addr, - u16 nlh_flags, u16 vid) + u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[]) { int err = 0; @@ -893,19 +968,24 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); - err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid); + err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } return err; } +static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { + [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, +}; + /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, struct netlink_ext_ack *extack) { + struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; @@ -938,6 +1018,16 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FDB_EXT_ATTRS]) { + attr = tb[NDA_FDB_EXT_ATTRS]; + err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, + br_nda_fdb_pol, extack); + if (err) + return err; + } else { + memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); + } + if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { @@ -946,9 +1036,9 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } /* VID was specified, so use it. 
*/ - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb); } else { - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb); if (err || !vg || !vg->num_vlans) goto out; @@ -959,7 +1049,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid); + err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, + nfea_tb); if (err) goto out; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7501be4eeba0..c0ae639e1b36 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -48,6 +48,8 @@ enum { /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" +#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; @@ -184,6 +186,8 @@ enum { BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, + BR_FDB_NOTIFY, + BR_FDB_NOTIFY_INACTIVE }; struct net_bridge_fdb_key { -- cgit v1.2.3 From b5f1d9ec283bd28a452cf61d7e5c2f2b1a9cccda Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 23 Jun 2020 23:47:18 +0300 Subject: net: bridge: add a flag to avoid refreshing fdb when changing/adding When we modify or create a new fdb entry sometimes we want to avoid refreshing its activity in order to track it properly. One example is when a mac is received from EVPN multi-homing peer by FRR, which doesn't want to change local activity accounting. It makes it static and sets a flag to track its activity. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/neighbour.h | 1 + net/bridge/br_fdb.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 21e569297355..dc8b72201f6c 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -191,6 +191,7 @@ enum { enum { NFEA_UNSPEC, NFEA_ACTIVITY_NOTIFY, + NFEA_DONT_REFRESH, __NFEA_MAX }; #define NFEA_MAX (__NFEA_MAX - 1) diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 642deb57c064..9db504baa094 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -860,6 +860,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); + bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; @@ -937,7 +938,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, fdb->used = jiffies; if (modified) { - fdb->updated = jiffies; + if (refresh) + fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } @@ -977,6 +979,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, + [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ -- cgit v1.2.3 From af7ec13833619e17f03aa73a785a2f871da6d66b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:09 -0700 Subject: bpf: Add bpf_skc_to_tcp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a tcp6_sock pointer. The return value could be NULL if the casting is illegal. A new helper return type RET_PTR_TO_BTF_ID_OR_NULL is added so the verifier is able to deduce proper return types for the helper. 
Different from the previous BTF_ID based helpers, the bpf_skc_to_tcp6_sock() argument can be several possible btf_ids. More specifically, all possible socket data structures with sock_common appearing first in the memory layout. This patch only added socket types related to tcp and udp. All possible argument btf_id and return value btf_id for helper bpf_skc_to_tcp6_sock() are pre-calculated and cached. In the future, it is even possible to precompute these btf_id's at kernel build time. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230809.3988195-1-yhs@fb.com --- include/linux/bpf.h | 12 +++++++ include/uapi/linux/bpf.h | 9 ++++- kernel/bpf/btf.c | 1 + kernel/bpf/verifier.c | 43 ++++++++++++++++------ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 82 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++- 8 files changed, 148 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1e1501ee53ce..c0e38ad07848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -265,6 +265,7 @@ enum bpf_return_type { RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ + RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -287,6 +288,12 @@ struct bpf_func_proto { enum bpf_arg_type arg_type[5]; }; int *btf_id; /* BTF ids of arguments */ + bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is + * valid. Often used if more + * than one btf id is permitted + * for this argument. 
+ */ + int *ret_btf_id; /* return value btf_id */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1524,6 +1531,7 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +void init_btf_sock_ids(struct btf *btf); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -1549,6 +1557,9 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline void init_btf_sock_ids(struct btf *btf) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) @@ -1638,6 +1649,7 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e377d1981730..4c3007f428b1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3674,6 +3674,7 @@ struct btf *btf_parse_vmlinux(void) goto errout; bpf_struct_ops_init(btf, log); + init_btf_sock_ids(btf); btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7460f967cb75..7de98906ddf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3800,12 +3800,14 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } -static int check_func_arg(struct bpf_verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, - struct bpf_call_arg_meta *meta) +static int check_func_arg(struct bpf_verifier_env *env, u32 arg, + struct bpf_call_arg_meta *meta, + const struct bpf_func_proto *fn) { + u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; + enum bpf_arg_type arg_type = fn->arg_type[arg]; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3885,9 +3887,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_BTF_ID; if (type != expected_type) goto err_type; - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), + if (!fn->check_btf_id) { + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + } else if (!fn->check_btf_id(reg->btf_id, arg)) { + verbose(env, "Helper does 
not support %s in R%d\n", kernel_type_name(reg->btf_id), regno); return -EACCES; @@ -4709,10 +4718,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (!fn->check_btf_id) { + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; + } + err = check_func_arg(env, i, &meta, fn); if (err) return err; } @@ -4815,6 +4826,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + int ret_btf_id; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + ret_btf_id = *fn->ret_btf_id; + if (ret_btf_id == 0) { + verbose(env, "invalid return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); + return -EINVAL; + } + regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0159f12d2af5..2a97a268f533 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1515,6 +1515,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
diff --git a/net/core/filter.c b/net/core/filter.c index c713b6b8938f..176e27d75c51 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -9225,3 +9226,84 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } + +/* Define a list of socket types which can be the argument for + * skc_to_*_sock() helpers. All these sockets should have + * sock_common as the first argument in its memory layout. + */ +#define BTF_SOCK_TYPE_xxx \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock") \ + BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock") + +enum { +#define BTF_SOCK_TYPE(name, str) name, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +MAX_BTF_SOCK_TYPE, +}; + +static int btf_sock_ids[MAX_BTF_SOCK_TYPE]; + +#ifdef CONFIG_BPF_SYSCALL +static const char *bpf_sock_types[] = { +#define BTF_SOCK_TYPE(name, str) str, +BTF_SOCK_TYPE_xxx +#undef BTF_SOCK_TYPE +}; + +void init_btf_sock_ids(struct btf *btf) +{ + int i, btf_id; + + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) { + btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i], + BTF_KIND_STRUCT); + if (btf_id > 0) + btf_sock_ids[i] = btf_id; + } +} +#endif + +static bool check_arg_btf_id(u32 btf_id, u32 arg) +{ + int i; + + /* only one argument, no need 
to check arg */ + for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) + if (btf_sock_ids[i] == btf_id) + return true; + return false; +} + +BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) +{ + /* tcp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct tcp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && + sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { + .func = bpf_skc_to_tcp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 91fa668fa860..6c2f64118651 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -421,6 +421,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -458,6 +459,7 @@ class PrinterHelpers(Printer): 'struct sockaddr', 'struct tcphdr', 'struct seq_file', + 'struct tcp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d9737d51dd19..e90ad07b291a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3255,6 +3255,12 @@ union bpf_attr { * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3392,7 +3398,8 @@ union bpf_attr { FN(ringbuf_submit), \ FN(ringbuf_discard), \ FN(ringbuf_query), \ - FN(csum_level), + FN(csum_level), \ + FN(skc_to_tcp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 478cfbdf5f13dfe09cfd0b1cbac821f5e27f6108 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:11 -0700 Subject: bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers Three more helpers are added to cast a sock_common pointer to a tcp_sock, a tcp_timewait_sock or a tcp_request_sock for tracing programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230811.3988277-1-yhs@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 23 +++++++++++++++- kernel/trace/bpf_trace.c | 6 ++++ net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 6 ++++ tools/include/uapi/linux/bpf.h | 23 +++++++++++++++- 6 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c0e38ad07848..c23998cf6699 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1650,6 +1650,9 @@ extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 
e90ad07b291a..b9412ab275f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2a97a268f533..48d935b0d87c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1517,6 +1517,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
diff --git a/net/core/filter.c b/net/core/filter.c index 176e27d75c51..0b4e5aed7e20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -74,6 +74,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -9307,3 +9308,64 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; + +BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { + .func = bpf_skc_to_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +}; + +BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { + .func = bpf_skc_to_tcp_timewait_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], +}; + +BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) +{ + if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; + +#if IS_BUILTIN(CONFIG_IPV6) + if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) + return (unsigned long)sk; +#endif + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { + .func = bpf_skc_to_tcp_request_sock, + .gpl_only = false, + .ret_type = 
RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6c2f64118651..d886657c6aaa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -422,6 +422,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -460,6 +463,9 @@ class PrinterHelpers(Printer): 'struct tcphdr', 'struct seq_file', 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e90ad07b291a..b9412ab275f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3261,6 +3261,24 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3399,7 +3417,10 @@ union bpf_attr { FN(ringbuf_discard), \ FN(ringbuf_query), \ FN(csum_level), \ - FN(skc_to_tcp6_sock), + FN(skc_to_tcp6_sock), \ + FN(skc_to_tcp_sock), \ + FN(skc_to_tcp_timewait_sock), \ + FN(skc_to_tcp_request_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 0d4fad3e57df2bf61e8ffc8d12a34b1caf9b8835 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:15 -0700 Subject: bpf: Add bpf_skc_to_udp6_sock() helper The helper is used in tracing programs to cast a socket pointer to a udp6_sock pointer. The return value could be NULL if the casting is illegal. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/20200623230815.3988481-1-yhs@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 22 ++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 9 ++++++++- 6 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c23998cf6699..3d2ade703a35 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1653,6 +1653,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; +extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/include/uapi/linux/bpf.h +++ 
b/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 48d935b0d87c..5d59dda5f661 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1523,6 +1523,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 0b4e5aed7e20..c796e141ea8e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9369,3 +9369,25 @@ const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .check_btf_id = check_arg_btf_id, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; + +BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) +{ + /* udp6_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. 
+ */ + BTF_TYPE_EMIT(struct udp6_sock); + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && + sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { + .func = bpf_skc_to_udp6_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID, + .check_btf_id = check_arg_btf_id, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], +}; diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index d886657c6aaa..6bab40ff442e 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -425,6 +425,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', 'struct __sk_buff', 'struct sk_msg_md', @@ -466,6 +467,7 @@ class PrinterHelpers(Printer): 'struct tcp_sock', 'struct tcp_timewait_sock', 'struct tcp_request_sock', + 'struct udp6_sock', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9412ab275f3..0cb8ec948816 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3279,6 +3279,12 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or NULL otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3420,7 +3426,8 @@ union bpf_attr { FN(skc_to_tcp6_sock), \ FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ - FN(skc_to_tcp_request_sock), + FN(skc_to_tcp_request_sock), \ + FN(skc_to_udp6_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From bccb48c89fe3c09f1cbb7c8612e31f5daa1d4541 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Jun 2020 20:13:21 +0200 Subject: batman-adv: Fix typos and grammar in documentation Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- Documentation/networking/batman-adv.rst | 6 ++-- include/uapi/linux/batadv_packet.h | 50 ++++++++++++++++----------------- include/uapi/linux/batman_adv.h | 4 +-- net/batman-adv/bat_iv_ogm.c | 8 +++--- net/batman-adv/bat_v_elp.c | 10 +++---- net/batman-adv/bat_v_ogm.c | 14 ++++----- net/batman-adv/bridge_loop_avoidance.c | 6 ++-- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/fragmentation.c | 6 ++-- net/batman-adv/hard-interface.c | 14 ++++----- net/batman-adv/log.h | 6 ++-- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 6 ++-- net/batman-adv/multicast.c | 21 +++++++------- net/batman-adv/netlink.c | 2 +- net/batman-adv/network-coding.c | 14 ++++----- net/batman-adv/originator.c | 8 +++--- net/batman-adv/routing.c | 4 +-- net/batman-adv/send.c | 4 +-- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/tp_meter.c | 12 ++++---- net/batman-adv/translation-table.c | 10 +++---- net/batman-adv/tvlv.c | 4 +-- net/batman-adv/types.h | 12 ++++---- 24 files changed, 114 insertions(+), 113 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/batman-adv.rst b/Documentation/networking/batman-adv.rst index 02af49b08635..74821d29a22f 100644 --- a/Documentation/networking/batman-adv.rst +++ b/Documentation/networking/batman-adv.rst @@ -73,7 +73,7 @@ lower value. 
This will make the mesh more responsive to topology changes, but will also increase the overhead. Information about the current state can be accessed via the batadv generic -netlink family. batctl provides human readable version via its debug tables +netlink family. batctl provides a human readable version via its debug tables subcommands. @@ -115,8 +115,8 @@ are prefixed with "batman-adv:" So to see just these messages try:: $ dmesg | grep batman-adv When investigating problems with your mesh network, it is sometimes necessary to -see more detail debug messages. This must be enabled when compiling the -batman-adv module. When building batman-adv as part of kernel, use "make +see more detailed debug messages. This must be enabled when compiling the +batman-adv module. When building batman-adv as part of the kernel, use "make menuconfig" and enable the option ``B.A.T.M.A.N. debugging`` (``CONFIG_BATMAN_ADV_DEBUG=y``). diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 0ae34c85ef9e..9c8604c5b5f6 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -72,8 +72,8 @@ enum batadv_subtype { /** * enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets - * @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was - * previously received from someone else than the best neighbor. + * @BATADV_NOT_BEST_NEXT_HOP: flag is set when the ogm packet is forwarded and + * was previously received from someone other than the best neighbor. * @BATADV_PRIMARIES_FIRST_HOP: flag unused. * @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a * one hop neighbor on the interface where it was originally received. 
@@ -195,8 +195,8 @@ struct batadv_bla_claim_dst { /** * struct batadv_ogm_packet - ogm (routing protocol) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @flags: contains routing relevant flags - see enum batadv_iv_flags * @seqno: sequence identification * @orig: address of the source node @@ -247,7 +247,7 @@ struct batadv_ogm2_packet { /** * struct batadv_elp_packet - elp (neighbor discovery) packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header + * @version: batman-adv protocol version, part of the general header * @orig: originator mac address * @seqno: sequence number * @elp_interval: currently used ELP sending interval in ms @@ -265,15 +265,15 @@ struct batadv_elp_packet { /** * struct batadv_icmp_header - common members among all the ICMP packets * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node * @uid: local ICMP socket identifier * @align: not used - useful for alignment purposes only * - * This structure is used for ICMP packets parsing only and it is never sent + * This structure is used for ICMP packet parsing only and it is never sent * over the wire. The alignment field at the end is there to ensure that * members are padded the same way as they are in real packets. 
*/ @@ -291,8 +291,8 @@ struct batadv_icmp_header { /** * struct batadv_icmp_packet - ICMP packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -315,8 +315,8 @@ struct batadv_icmp_packet { /** * struct batadv_icmp_tp_packet - ICMP TP Meter packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -358,8 +358,8 @@ enum batadv_icmp_tp_subtype { /** * struct batadv_icmp_packet_rr - ICMP RouteRecord packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @msg_type: ICMP packet type * @dst: address of the destination node * @orig: address of the source node @@ -397,8 +397,8 @@ struct batadv_icmp_packet_rr { /** * struct batadv_unicast_packet - unicast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * 
@version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @ttvn: translation table version number * @dest: originator destination of the unicast packet */ @@ -433,8 +433,8 @@ struct batadv_unicast_4addr_packet { /** * struct batadv_frag_packet - fragmented packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @dest: final destination used when routing fragments * @orig: originator of the fragment used when merging the packet * @no: fragment number within this sequence @@ -467,8 +467,8 @@ struct batadv_frag_packet { /** * struct batadv_bcast_packet - broadcast packet for network payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved byte for alignment * @seqno: sequence identification * @orig: originator of the broadcast packet @@ -488,10 +488,10 @@ struct batadv_bcast_packet { /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @first_source: original source of first included packet - * @first_orig_dest: original destinal of first included 
packet + * @first_orig_dest: original destination of first included packet * @first_crc: checksum of first included packet * @first_ttvn: tt-version number of first included packet * @second_ttl: ttl of second packet @@ -523,8 +523,8 @@ struct batadv_coded_packet { /** * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header - * @version: batman-adv protocol version, part of the genereal header - * @ttl: time to live for this packet, part of the genereal header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header * @reserved: reserved field (for packet alignment) * @src: address of the source * @dst: address of the destination diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 617c180ff0c8..8cf2ad11ead9 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -69,7 +69,7 @@ enum batadv_tt_client_flags { /** * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be - * part of the network but no nnode has already announced it + * part of the network but no node has already announced it */ BATADV_TT_CLIENT_TEMP = (1 << 11), }; @@ -131,7 +131,7 @@ enum batadv_gw_modes { /** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */ BATADV_GW_MODE_CLIENT, - /** @BATADV_GW_MODE_SERVER: announce itself as gatway server */ + /** @BATADV_GW_MODE_SERVER: announce itself as gateway server */ BATADV_GW_MODE_SERVER, }; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index e87f19c82e8d..5b3a41983156 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -134,7 +134,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) * * Return: the originator object corresponding to the passed mac address or NULL * on failure. - * If the object does not exists it is created an initialised. 
+ * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -871,7 +871,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) } /** - * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over iterface + * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast @@ -1554,7 +1554,7 @@ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) @@ -2288,7 +2288,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information - * @single_hardif: Limit dump to this hard interfaace + * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 0bdefa35da98..d35aca0e969a 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -60,7 +60,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) * @neigh: the neighbour for which the throughput has to be obtained * * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' 
equals to 0.1Mbps, '10' equals 1Mbps, etc). + * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). */ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { @@ -183,8 +183,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) * * Sends a predefined number of unicast wifi packets to a given neighbour in * order to trigger the throughput estimation on this link by the RC algorithm. - * Packets are sent only if there there is not enough payload unicast traffic - * towards this neighbour.. + * Packets are sent only if there is not enough payload unicast traffic towards + * this neighbour. * * Return: True on success and false in case of error during skb preparation. */ @@ -244,7 +244,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * batadv_v_elp_periodic_work() - ELP periodic task per interface * @work: work queue item * - * Emits broadcast ELP message in regular intervals. + * Emits broadcast ELP messages in regular intervals. */ static void batadv_v_elp_periodic_work(struct work_struct *work) { @@ -499,7 +499,7 @@ orig_free: * @skb: the received packet * @if_incoming: the interface this packet was received through * - * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly + * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly * processed or NET_RX_DROP in case of failure. */ int batadv_v_elp_packet_recv(struct sk_buff *skb, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 18028b9f95f0..0d404f7bcd9f 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -47,9 +47,9 @@ * @bat_priv: the bat priv with all the soft interface information * @addr: the address of the originator * - * Return: the orig_node corresponding to the specified address. If such object - * does not exist it is allocated here. In case of allocation failure returns - * NULL. + * Return: the orig_node corresponding to the specified address. If such an
If such an + * object does not exist, it is allocated here. In case of allocation failure + * returns NULL. */ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) @@ -172,7 +172,7 @@ static bool batadv_v_ogm_queue_left(struct sk_buff *skb, * batadv_v_ogm_aggr_list_free - free all elements in an aggregation queue * @hard_iface: the interface holding the aggregation queue * - * Empties the OGMv2 aggregation queue and frees all the skbs it contained. + * Empties the OGMv2 aggregation queue and frees all the skbs it contains. * * Caller needs to hold the hard_iface->bat_v.aggr_list.lock. */ @@ -378,7 +378,7 @@ static void batadv_v_ogm_send(struct work_struct *work) * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface * @work: work queue item * - * Emits aggregated OGM message in regular intervals. + * Emits aggregated OGM messages in regular intervals. */ void batadv_v_ogm_aggr_work(struct work_struct *work) { @@ -399,7 +399,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) * batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V * @hard_iface: the interface to prepare * - * Takes care of scheduling own OGM sending routine for this interface. + * Takes care of scheduling its own OGM sending routine for this interface. 
* * Return: 0 on success or a negative error code otherwise */ @@ -847,7 +847,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, * batadv_v_ogm_process() - process an incoming batman v OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) - * @if_incoming: the interface where this packet was receved + * @if_incoming: the interface where this packet was received */ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 41cc87f06b14..91a04ca373dc 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -992,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * - * checks if it is a claim packet and if its on the same group. + * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * @@ -1757,7 +1757,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, - * throw an uevent and log the event if that is the case. + * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. @@ -1815,7 +1815,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, * * we have to race for a claim * * if the frame is allowed on the LAN * - * in these cases, the skb is further handled by this function + * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. 
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b85da4b7a77b..0e6e53e9b5f3 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -666,7 +666,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * - * This function copies the skb with pskb_copy() and is sent as unicast packet + * This function copies the skb with pskb_copy() and is sent as a unicast packet * to each of the selected candidates. * * Return: true if the packet is sent to at least one candidate, false diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 7cad97644d05..9fdbe3068153 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -102,8 +102,8 @@ static int batadv_frag_size_limit(void) * * Caller must hold chain->lock. * - * Return: true if chain is empty and caller can just insert the new fragment - * without searching for the right position. + * Return: true if chain is empty and the caller can just insert the new + * fragment without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) @@ -306,7 +306,7 @@ free: * set *skb to merged packet; 2) Packet is buffered: Return true and set *skb * to NULL; 3) Error: Return false and free skb. * - * Return: true when packet is merged or buffered, false when skb is not not + * Return: true when the packet is merged or buffered, false when skb is not * used.
*/ bool batadv_frag_skb_buffer(struct sk_buff **skb, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 3a256af92784..53c27c67cc11 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -138,10 +138,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it - * is important to prevent this new interface to be used to create a new mesh - * network (this behaviour would lead to a batman-over-batman configuration). - * This function recursively checks all the fathers of the device passed as - * argument looking for a batman-adv soft interface. + * is important to prevent this new interface from being used to create a new + * mesh network (this behaviour would lead to a batman-over-batman + * configuration). This function recursively checks all the fathers of the + * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -680,8 +680,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) * @slave: the interface enslaved in another master * @master: the master from which slave has to be removed * - * Invoke ndo_del_slave on master passing slave as argument. In this way slave - * is free'd and master can correctly change its internal state. + * Invoke ndo_del_slave on master passing slave as argument. In this way the + * slave is free'd and the master can correctly change its internal state. * * Return: 0 on success, a negative value representing the error otherwise */ @@ -818,7 +818,7 @@ err: * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be - * off when another functions is modifying the list at the same time. 
The + * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index f9884dc56cf3..979864c0fa6b 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -69,7 +69,7 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...) __printf(2, 3); /** - * _batadv_dbg() - Store debug output with(out) ratelimiting + * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @ratelimited: whether output should be rate limited @@ -95,7 +95,7 @@ static inline void _batadv_dbg(int type __always_unused, #endif /** - * batadv_dbg() - Store debug output without ratelimiting + * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments @@ -104,7 +104,7 @@ static inline void _batadv_dbg(int type __always_unused, _batadv_dbg(type, bat_priv, 0, ## arg) /** - * batadv_dbg_ratelimited() - Store debug output with ratelimiting + * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message * @bat_priv: the bat priv with all the soft interface information * @arg: format string and variable arguments diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d8a255c85e77..519c08c2cfba 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -666,7 +666,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * - * Return: true if AP isolation is on for the VLAN idenfied by vid, false + * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool 
batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 42b8d1e76dea..0393bb9ed3d0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -308,7 +308,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: @@ -330,11 +330,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, /** * batadv_seq_after() - Checks if a sequence number x is a successor of y - * @x: potential sucessor of @y + * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor - * unless the variable sequence number has grown by more then + * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 9ebdc1e864b9..bdc4a1fba1c6 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -510,7 +510,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * - * If there is a bridge interface on top of dev, collects from that one + * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. 
@@ -832,8 +832,8 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * - * Whenever the multicast TVLV flags this nodes announces change this notifies - * userspace via the 'mcast' log level. + * Whenever the multicast TVLV flags this node announces change, this function + * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { @@ -1244,7 +1244,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or - * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and + * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * @@ -1693,7 +1693,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, } /** - * batadv_mcast_forw_send() - send packet to any detected multicast recpient + * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier @@ -1742,7 +1742,8 @@ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, - * orig, has toggled then this method updates counter and list accordingly. + * orig, has toggled then this method updates the counter and the list + * accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ @@ -1787,7 +1788,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1832,7 +1833,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1877,7 +1878,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ @@ -1922,7 +1923,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has - * toggled then this method updates counter and list accordingly. + * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 02ed073f95a9..cfb00dfa468a 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -640,7 +640,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) * @bat_priv: the bat priv with all the soft interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop - * @test_time: total time ot the tp_meter session + * @test_time: total time of the tp_meter session * @total_bytes: bytes acked to the receiver * @cookie: cookie of tp_meter session * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b0469d15da0e..48d707850f3e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -134,7 +134,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, } /** - * batadv_nc_mesh_init() - initialise coding hash table and start house keeping + * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success or negative error number in case of failure @@ -700,7 +700,7 @@ batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, } /** - * batadv_nc_worker() - periodic task for house keeping related to network + * batadv_nc_worker() - periodic task for housekeeping related to network * coding * @work: kernel work struct */ @@ -1316,7 +1316,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, } /** - * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of + * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). 
* @bat_priv: the bat priv with all the soft interface information * @skb: data skb to forward @@ -1402,10 +1402,10 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, * @neigh_node: next hop to forward packet to * @ethhdr: pointer to the ethernet header inside the skb * - * Loops through list of neighboring nodes the next hop has a good connection to - * (receives OGMs with a sufficient quality). We need to find a neighbor of our - * next hop that potentially sent a packet which our next hop also received - * (overheard) and has stored for later decoding. + * Loops through the list of neighboring nodes the next hop has a good + * connection to (receives OGMs with a sufficient quality). We need to find a + * neighbor of our next hop that potentially sent a packet which our next hop + * also received (overheard) and has stored for later decoding. * * Return: true if the skb was consumed (encoded packet sent) or false otherwise */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 5b0c2fffc214..805d8969bdfb 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -325,7 +325,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to * - * Return: the neighbor which should be router for this orig_node/iface. + * Return: the neighbor which should be the router for this orig_node/iface. * * The object is returned with refcounter increased by 1. */ @@ -515,7 +515,7 @@ out: * Looks for and possibly returns a neighbour belonging to this originator list * which is connected through the provided hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. 
Otherwise NULL */ static struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, @@ -620,7 +620,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface, * * Looks for and possibly returns a neighbour belonging to this hard interface. * - * Return: neighbor when found. Othwerwise NULL + * Return: neighbor when found. Otherwise NULL */ struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, @@ -999,7 +999,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the originator * - * Creates a new originator object and initialise all the generic fields. + * Creates a new originator object and initialises all the generic fields. * The new object is not added to the originator list. * * Return: the newly created object or NULL on failure. diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index d343382e9664..27cdf5e4349a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -449,7 +449,7 @@ free_skb: * @skb: packet to check * @hdr_size: size of header to pull * - * Check for short header and bad addresses in given packet. + * Checks for short header and bad addresses in the given packet. * * Return: negative value when check fails and 0 otherwise. The negative value * depends on the reason: -ENODATA for bad header, -EBADR for broadcast @@ -1113,7 +1113,7 @@ free_skb: * @recv_if: interface that the skb is received on * * This function does one of the three following things: 1) Forward fragment, if - * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till + * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still * lack further fragments; 3) Merge fragments, if we have all needed parts. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. 
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 7f8ade04e08e..d267b94800d6 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -605,8 +605,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * - * The packets are being moved from the forw_list to the cleanup_list and - * by that allows already running threads to notice the claiming. + * The packets are being moved from the forw_list to the cleanup_list. This + * makes it possible for already running threads to notice the claim. */ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index f1f1c86f3419..23833a0ba5e6 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -406,7 +406,7 @@ end: * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends a ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bd2ac570c42c..db7e3774825b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -66,7 +66,7 @@ /** * BATADV_TP_MAX_RTO - Maximum sender timeout. 
If the sender RTO gets beyond - * such amound of milliseconds, the receiver is considered unreachable and the + * such amount of milliseconds, the receiver is considered unreachable and the * connection is killed */ #define BATADV_TP_MAX_RTO 30000 @@ -108,10 +108,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid) * batadv_tp_cwnd() - compute the new cwnd size * @base: base cwnd size value * @increment: the value to add to base to get the new size - * @min: minumim cwnd value (usually MSS) + * @min: minimum cwnd value (usually MSS) * - * Return the new cwnd size and ensures it does not exceed the Advertised - * Receiver Window size. It is wrap around safe. + * Return the new cwnd size and ensure it does not exceed the Advertised + * Receiver Window size. It is wrapped around safely. * For details refer to Section 3.1 of RFC5681 * * Return: new congestion window size in bytes @@ -254,7 +254,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after - * having incremented the refcounter. Return NULL is not found + * having increment the refcounter. Return NULL is not found * * Return: matching tp_vars or NULL when no tp_vars with @dst was found */ @@ -291,7 +291,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, * @session: session identifier * * Look for a tp_vars object matching dst as end_point, session as tp meter - * session and return it after having incremented the refcounter. Return NULL + * session and return it after having increment the refcounter. 
Return NULL * is not found * * Return: matching tp_vars or NULL when no tp_vars was found diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index a9635c882fe0..98a0aaaf0d50 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -301,7 +301,7 @@ void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) * @vid: VLAN identifier * * Return: the number of originators advertising the given address/data - * (excluding ourself). + * (excluding our self). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) @@ -842,7 +842,7 @@ out: * table. In case of success the value is updated with the real amount of * reserved bytes * Allocate the needed amount of memory for the entire TT TVLV and write its - * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data + * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data * objects, one per active VLAN served by the originator node. * * Return: the size of the allocated buffer or 0 in case of failure. @@ -1674,7 +1674,7 @@ out: * the function argument. * If a TT local entry exists for this non-mesh client remove it. * - * The caller must hold orig_node refcount. + * The caller must hold the orig_node refcount. * * Return: true if the new entry has been added, false otherwise */ @@ -1839,7 +1839,7 @@ out: * @bat_priv: the bat priv with all the soft interface information * @tt_global_entry: global translation table entry to be analyzed * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). * Return: best originator list entry or NULL on errors. 
*/ static struct batadv_tt_orig_list_entry * @@ -1887,7 +1887,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, * @tt_global_entry: global translation table entry to be printed * @seq: debugfs table seq_file struct * - * This functon assumes the caller holds rcu_read_lock(). + * This function assumes the caller holds rcu_read_lock(). */ static void batadv_tt_global_print_entry(struct batadv_priv *bat_priv, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 0963a43ad996..6a23a566cde1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -353,8 +353,8 @@ end: * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * - * Return: success if handler was not found or the return value of the handler - * callback. + * Return: success if the handler was not found or the return value of the + * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index d152b8e81f61..cc151e1f23b2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -455,8 +455,8 @@ struct batadv_orig_node { spinlock_t tt_buff_lock; /** - * @tt_lock: prevents from updating the table while reading it. Table - * update is made up by two operations (data structure update and + * @tt_lock: avoids concurrent read from and write to the table. Table + * update is made up of two operations (data structure update and * metadata -CRC/TTVN-recalculation) and they have to be executed * atomically in order to avoid another thread to read the * table/metadata between those. 
@@ -748,7 +748,7 @@ struct batadv_neigh_ifinfo { * struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression */ struct batadv_bcast_duplist_entry { - /** @orig: mac address of orig node orginating the broadcast */ + /** @orig: mac address of orig node originating the broadcast */ u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ @@ -1010,7 +1010,7 @@ struct batadv_priv_tt { /** * @commit_lock: prevents from executing a local TT commit while reading - * the local table. The local TT commit is made up by two operations + * the local table. The local TT commit is made up of two operations * (data structure update and metadata -CRC/TTVN- recalculation) and * they have to be executed atomically in order to avoid another thread * to read the table/metadata between those. @@ -1024,7 +1024,7 @@ struct batadv_priv_tt { #ifdef CONFIG_BATMAN_ADV_BLA /** - * struct batadv_priv_bla - per mesh interface bridge loope avoidance data + * struct batadv_priv_bla - per mesh interface bridge loop avoidance data */ struct batadv_priv_bla { /** @num_requests: number of bla requests in flight */ @@ -1718,7 +1718,7 @@ struct batadv_priv { spinlock_t softif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA - /** @bla: bridge loope avoidance data */ + /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif -- cgit v1.2.3 From 3bda14d09dc5789a895ab02b7dcfcec19b0a65b3 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 1 Jun 2020 22:35:22 +0200 Subject: batman-adv: Introduce a configurable per interface hop penalty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some setups multiple hard interfaces with similar link qualities or throughput values are available. But people have expressed the desire to consider one of them as a backup only. 
Some creative solutions are currently in use: Such people are configuring multiple batman-adv mesh/soft interfaces, wiring them together with some veth pairs and then tuning the hop penalty to achieve an effect similar to a tunable per interface hop penalty. This patch introduces a new, configurable, per hard interface hop penalty to simplify such setups. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 3 ++- net/batman-adv/bat_iv_ogm.c | 17 +++++++++-------- net/batman-adv/bat_v_ogm.c | 13 ++++++++++--- net/batman-adv/hard-interface.c | 2 ++ net/batman-adv/netlink.c | 12 +++++++++++- net/batman-adv/types.h | 6 ++++++ 6 files changed, 40 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 8cf2ad11ead9..bb0ae945b36a 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -427,7 +427,8 @@ enum batadv_nl_attrs { /** * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied - * to an originator message's tq-field on every hop.
+ * to an originator message's tq-field on every hop and/or per + * hard interface */ BATADV_ATTR_HOP_PENALTY, diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5b3a41983156..a4faf5f904d9 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1075,10 +1075,10 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - unsigned int tq_iface_penalty; bool ret = false; /* find corresponding one hop neighbor */ @@ -1157,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; + tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. 
*/ - tq_iface_penalty = BATADV_TQ_MAX_VALUE; if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) - tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, - bat_priv); + tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, + bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * - tq_iface_penalty; + tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", + "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, - neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty, - batadv_ogm_packet->tq, if_incoming->net_dev->name, + neigh_rq_count, tq_own, tq_asym_penalty, + tq_iface_hop_penalty, batadv_ogm_packet->tq, + if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 0d404f7bcd9f..0f8495b9eeb1 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -455,15 +455,17 @@ unlock: * @throughput: the current throughput * * Apply a penalty on the current throughput metric value based on the - * characteristic of the interface where the OGM has been received. The return - * value is computed as follows: + * characteristic of the interface where the OGM has been received. + * + * Initially the per hardif hop penalty is applied to the throughput. 
After + * that the return value is then computed as follows: * - throughput * 50% if the incoming and outgoing interface are the * same WiFi interface and the throughput is above * 1MBit/s * - throughput if the outgoing interface is the default * interface (i.e. this OGM is processed for the * internal table and not forwarded) - * - throughput * hop penalty otherwise + * - throughput * node hop penalty otherwise * * Return: the penalised throughput metric. */ @@ -472,9 +474,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, u32 throughput) { + int if_hop_penalty = atomic_read(&if_incoming->hop_penalty); int hop_penalty = atomic_read(&bat_priv->hop_penalty); int hop_penalty_max = BATADV_TQ_MAX_VALUE; + /* Apply per hardif hop penalty */ + throughput = throughput * (hop_penalty_max - if_hop_penalty) / + hop_penalty_max; + /* Don't apply hop penalty in default originator table. */ if (if_outgoing == BATADV_IF_DEFAULT) return throughput; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 53c27c67cc11..fa06b51c0144 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -939,6 +939,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; + atomic_set(&hard_iface->hop_penalty, 0); + batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index cfb00dfa468a..dc193618a761 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -826,6 +826,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&hard_iface->hop_penalty))) + goto nla_put_failure; + #ifdef CONFIG_BATMAN_ADV_BATMAN_V if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, 
atomic_read(&hard_iface->bat_v.elp_interval))) @@ -920,9 +924,15 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr)); + } #ifdef CONFIG_BATMAN_ADV_BATMAN_V - struct nlattr *attr; if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cc151e1f23b2..ed519efa3c36 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -208,6 +208,12 @@ struct batadv_hard_iface { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; + /** + * @hop_penalty: penalty which will be applied to the tq-field + * of an OGM received via this interface + */ + atomic_t hop_penalty; + /** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */ struct batadv_hard_iface_bat_iv bat_iv; -- cgit v1.2.3 From fe80536acf8397827be77f9b8ada384b90e790d0 Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 28 Jun 2020 23:18:23 +0530 Subject: bareudp: Added attribute to enable & disable rx metadata collection Metadata need not be collected in receive if the packet from bareudp device is not targeted to openvswitch. Signed-off-by: Martin Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++++-- drivers/net/bareudp.c | 23 +++++++++++++++++------ include/net/bareudp.h | 1 + include/uapi/linux/if_link.h | 1 + 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 465a8b251bfe..0e00636d8d74 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,5 +48,7 @@ enabled. 
The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device extracts and stores the tunnel information in SKB dst field before -passing the packet buffer to the network stack. +bareudp device decapsulates the udp header and passes the inner packet to the +network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel +information will be stored in the SKB dst field before the packet buffer is +passed to the network stack. diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 3dd46cd55114..108a8cafc4f8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,6 +46,7 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -125,13 +126,14 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; + if (bareudp->rx_collect_metadata) { + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + skb_dst_set(skb, &tun_dst->dst); } - skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -575,6 +577,9 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; + if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) + conf->rx_collect_metadata = true; + return 0; } @@ -612,6 +617,8 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = 
conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; + bareudp->rx_collect_metadata = conf->rx_collect_metadata; + err = register_netdevice(dev); if (err) return err; @@ -669,6 +676,7 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ + nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -685,6 +693,9 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; + if (bareudp->rx_collect_metadata && + nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) + goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index dc65a0d71d9b..3dd5f9a8d01c 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,6 +12,7 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; + bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a009365ad67b..cc185a007ade 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -600,6 +600,7 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, + IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3 From aee9caa03fc3c8b02f8f31824354d85f30e562e0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sat, 27 Jun 2020 01:45:28 +0300 Subject: net: sched: sch_red: Add qevents "early_drop" and "mark" In order to allow acting on dropped and/or ECN-marked packets, add two new qevents to the RED qdisc: "early_drop" and "mark". 
Filters attached at "early_drop" block are executed as packets are early-dropped, those attached at the "mark" block are executed as packets are ECN-marked. Two new attributes are introduced: TCA_RED_EARLY_DROP_BLOCK with the block index for the "early_drop" qevent, and TCA_RED_MARK_BLOCK for the "mark" qevent. Absence of these attributes signifies "don't care": no block is allocated in that case, or the existing blocks are left intact in case of the change callback. For purposes of offloading, blocks attached to these qevents appear with newly-introduced binder types, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP and FLOW_BLOCK_BINDER_TYPE_RED_MARK. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/net/flow_offload.h | 2 ++ include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_red.c | 58 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3bafb5124ac0..3e793ac66baf 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -424,6 +424,8 @@ enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a95f3ae7ab37..9e7c2c607845 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -257,6 +257,8 @@ enum { TCA_RED_STAB, TCA_RED_MAX_P, TCA_RED_FLAGS, /* bitfield32 */ + TCA_RED_EARLY_DROP_BLOCK, /* u32 */ + TCA_RED_MARK_BLOCK, /* u32 */ __TCA_RED_MAX, }; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 225ce370e5a8..de2be4d04ed6 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -46,6 +46,8 @@ struct red_sched_data { struct red_vars vars; struct red_stats stats; struct Qdisc *qdisc; + struct 
tcf_qevent qe_early_drop; + struct tcf_qevent qe_mark; }; #define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) @@ -92,6 +94,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.prob_drop++; goto congestion_drop; @@ -109,6 +114,9 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; + skb = tcf_qevent_handle(&q->qe_mark, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; } else if (!red_use_nodrop(q)) { q->stats.forced_drop++; goto congestion_drop; @@ -129,6 +137,10 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_ return ret; congestion_drop: + skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb, root_lock, to_free, &ret); + if (!skb) + return NET_XMIT_CN | ret; + qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; } @@ -202,6 +214,8 @@ static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + tcf_qevent_destroy(&q->qe_mark, sch); + tcf_qevent_destroy(&q->qe_early_drop, sch); del_timer_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); @@ -213,6 +227,8 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), + [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 }, + [TCA_RED_MARK_BLOCK] = { .type = NLA_U32 }, }; static int __red_change(struct Qdisc *sch, struct nlattr **tb, @@ -328,12 +344,38 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt, q->qdisc = &noop_qdisc; q->sch = sch; timer_setup(&q->adapt_timer, red_adaptative_timer, 0); - return __red_change(sch, tb, 
extack); + + err = __red_change(sch, tb, extack); + if (err) + return err; + + err = tcf_qevent_init(&q->qe_early_drop, sch, + FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + goto err_early_drop_init; + + err = tcf_qevent_init(&q->qe_mark, sch, + FLOW_BLOCK_BINDER_TYPE_RED_MARK, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + goto err_mark_init; + + return 0; + +err_mark_init: + tcf_qevent_destroy(&q->qe_early_drop, sch); +err_early_drop_init: + del_timer_sync(&q->adapt_timer); + red_offload(sch, false); + qdisc_put(q->qdisc); + return err; } static int red_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct red_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_RED_MAX + 1]; int err; @@ -345,6 +387,16 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + err = tcf_qevent_validate_change(&q->qe_early_drop, + tb[TCA_RED_EARLY_DROP_BLOCK], extack); + if (err) + return err; + + err = tcf_qevent_validate_change(&q->qe_mark, + tb[TCA_RED_MARK_BLOCK], extack); + if (err) + return err; + return __red_change(sch, tb, extack); } @@ -389,7 +441,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, TC_RED_SUPPORTED_FLAGS)) + q->flags, TC_RED_SUPPORTED_FLAGS) || + tcf_qevent_dump(skb, TCA_RED_MARK_BLOCK, &q->qe_mark) || + tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK, &q->qe_early_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); -- cgit v1.2.3 From ecc31c60240b9808a274befc5db6b8a249a6ade1 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Mon, 29 Jun 2020 23:46:16 +0300 Subject: ethtool: Add link extended state Currently, drivers can only tell whether the link is up/down using LINKSTATE_GET, but no additional information is given. 
Add attributes to the LINKSTATE_GET command in order to allow drivers to expose more information to the user in addition to the link state, to ease the debug process; for example, the reason for the link down state. Extended state consists of two attributes - link_ext_state and link_ext_substate. The idea is to avoid 'vendor specific' states, in order to prevent drivers from using a specific link_ext_state that could in the future become a common link_ext_state. The substates allow drivers to add more information to the common link_ext_state. For example, a vendor can expose 'Autoneg' as link_ext_state and add 'No partner detected during force mode' as link_ext_substate. If a driver cannot pinpoint the extended state with the substate accuracy, it is free to expose only the extended state and omit the substate attribute. Signed-off-by: Amit Cohen Reviewed-by: Jiri Pirko Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/ethtool.h | 23 ++++++++++++ include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 2 ++ net/ethtool/linkstate.c | 52 ++++++++++++++++++++++++--- 4 files changed, 143 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a23b26eab479..48ad3b6a0150 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -86,6 +86,22 @@ struct net_device; u32 ethtool_op_get_link(struct net_device *dev); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti); + +/** + * struct ethtool_link_ext_state_info - link extended state and substate.
+ */ +struct ethtool_link_ext_state_info { + enum ethtool_link_ext_state link_ext_state; + union { + enum ethtool_link_ext_substate_autoneg autoneg; + enum ethtool_link_ext_substate_link_training link_training; + enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; + enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; + enum ethtool_link_ext_substate_cable_issue cable_issue; + u8 __link_ext_substate; + }; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -245,6 +261,11 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * @get_link: Report whether physical link is up. Will only be called if * the netdev is up. Should usually be set to ethtool_op_get_link(), * which uses netif_carrier_ok(). + * @get_link_ext_state: Report link extended state. Should set link_ext_state and + * link_ext_substate (link_ext_substate of 0 means link_ext_substate is unknown, + * do not attach ext_substate attribute to netlink message). If link_ext_state + * and link_ext_substate are unknown, return -ENODATA. If not implemented, + * link_ext_state and link_ext_substate will not be sent to userspace. * @get_eeprom: Read data from the device EEPROM. * Should fill in the magic field. Don't need to check len for zero * or wraparound. 
Fill in the data argument with the eeprom values @@ -384,6 +405,8 @@ struct ethtool_ops { void (*set_msglevel)(struct net_device *, u32); int (*nway_reset)(struct net_device *); u32 (*get_link)(struct net_device *); + int (*get_link_ext_state)(struct net_device *, + struct ethtool_link_ext_state_info *); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f4662b3a9e1e..d1413538ef30 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -579,6 +579,76 @@ struct ethtool_pauseparam { __u32 tx_pause; }; +/** + * enum ethtool_link_ext_state - link extended state + */ +enum ethtool_link_ext_state { + ETHTOOL_LINK_EXT_STATE_AUTONEG, + ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, + ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, + ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, + ETHTOOL_LINK_EXT_STATE_NO_CABLE, + ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, + ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE, + ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, + ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, + ETHTOOL_LINK_EXT_STATE_OVERHEAT, +}; + +/** + * enum ethtool_link_ext_substate_autoneg - more information in addition to + * ETHTOOL_LINK_EXT_STATE_AUTONEG. + */ +enum ethtool_link_ext_substate_autoneg { + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE, + ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD, +}; + +/** + * enum ethtool_link_ext_substate_link_training - more information in addition to + * ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE. 
+ */ +enum ethtool_link_ext_substate_link_training { + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT, + ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY, + ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT, +}; + +/** + * enum ethtool_link_ext_substate_logical_mismatch - more information in addition + * to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH. + */ +enum ethtool_link_ext_substate_link_logical_mismatch { + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED, + ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED, +}; + +/** + * enum ethtool_link_ext_substate_bad_signal_integrity - more information in + * addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY. + */ +enum ethtool_link_ext_substate_bad_signal_integrity { + ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, +}; + +/** + * enum ethtool_link_ext_substate_cable_issue - more information in + * addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. 
+ */ +enum ethtool_link_ext_substate_cable_issue { + ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1, + ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, +}; + #define ETH_GSTRING_LEN 32 /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 4dda5e4244a7..c12ce4df4b6b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -236,6 +236,8 @@ enum { ETHTOOL_A_LINKSTATE_LINK, /* u8 */ ETHTOOL_A_LINKSTATE_SQI, /* u32 */ ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ + ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index afe5ac8a0f00..4834091ec24c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -9,10 +9,12 @@ struct linkstate_req_info { }; struct linkstate_reply_data { - struct ethnl_reply_data base; - int link; - int sqi; - int sqi_max; + struct ethnl_reply_data base; + int link; + int sqi; + int sqi_max; + bool link_ext_state_provided; + struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -25,6 +27,8 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_STATE] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_EXT_SUBSTATE] = { .type = NLA_REJECT }, }; static int linkstate_get_sqi(struct net_device *dev) @@ -61,6 +65,23 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_unlock(&phydev->lock); return ret; +}; + +static int linkstate_get_link_ext_state(struct net_device *dev, + struct linkstate_reply_data *data) +{ + int err; + + if (!dev->ethtool_ops->get_link_ext_state) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_link_ext_state(dev, 
&data->ethtool_link_ext_state_info); + if (err) + return err; + + data->link_ext_state_provided = true; + + return 0; } static int linkstate_prepare_data(const struct ethnl_req_info *req_base, @@ -86,6 +107,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; data->sqi_max = ret; + if (dev->flags & IFF_UP) { + ret = linkstate_get_link_ext_state(dev, data); + if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA) + goto out; + } + ret = 0; out: ethnl_ops_complete(dev); @@ -107,6 +134,12 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->sqi_max != -EOPNOTSUPP) len += nla_total_size(sizeof(u32)); + if (data->link_ext_state_provided) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ + + if (data->ethtool_link_ext_state_info.__link_ext_substate) + len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + return len; } @@ -128,6 +161,17 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) return -EMSGSIZE; + if (data->link_ext_state_provided) { + if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, + data->ethtool_link_ext_state_info.link_ext_state)) + return -EMSGSIZE; + + if (data->ethtool_link_ext_state_info.__link_ext_substate && + nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + data->ethtool_link_ext_state_info.__link_ext_substate)) + return -EMSGSIZE; + } + return 0; } -- cgit v1.2.3 From fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Jun 2020 23:28:44 -0700 Subject: bpf: Introduce helper bpf_get_task_stack() Introduce helper bpf_get_task_stack(), which dumps stack trace of given task. This is different to bpf_get_stack(), which gets stack trace of current task. One potential use case of bpf_get_task_stack() is to call it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
bpf_get_task_stack() uses stack_trace_save_tsk() instead of get_perf_callchain() for kernel stack. The benefit of this choice is that stack_trace_save_tsk() doesn't require changes in arch/. The downside of using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the stack trace to unsigned long array. For 32-bit systems, we need to translate it to u64 array. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200630062846.664389-3-songliubraving@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 37 +++++++++++++++++++- kernel/bpf/stackmap.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/verifier.c | 4 ++- kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 37 +++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3d2ade703a35..0cd7f6884c5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. 
+ * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure.
+ * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 071f98d0f7c6..5ad72ab2276b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } } +static struct perf_callchain_entry * +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +{ + struct perf_callchain_entry *entry; + int rctx; + + entry = get_callchain_entry(&rctx); + + if (!entry) + return NULL; + + entry->nr = init_nr + + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), + sysctl_perf_event_max_stack - init_nr, 0); + + /* stack_trace_save_tsk() works on unsigned long array, while + * perf_callchain_entry uses u64 array. For 32-bit systems, it is + * necessary to fix this mismatch. 
+ */ + if (__BITS_PER_LONG != 64) { + unsigned long *from = (unsigned long *) entry->ip; + u64 *to = entry->ip; + int i; + + /* copy data from the end to avoid using extra buffer */ + for (i = entry->nr - 1; i >= (int)init_nr; i--) + to[i] = (u64)(from[i]); + } + + put_callchain_entry(rctx); + + return entry; +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { @@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, - u64, flags) +static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, if (unlikely(size % elem_size)) goto clear; + /* cannot get valid user stack for task without user_mode regs */ + if (task && user && !user_mode(regs)) + goto err_fault; + num_elem = size / elem_size; if (sysctl_perf_event_max_stack < num_elem) init_nr = 0; else init_nr = sysctl_perf_event_max_stack - num_elem; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + + if (kernel && task) + trace = get_callchain_entry_for_task(task, init_nr); + else + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, + false, false); if (unlikely(!trace)) goto err_fault; @@ -505,6 +548,12 @@ clear: return err; } +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, buf, size, flags); +} + const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, @@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack, struct task_struct 
*, task, void *, buf, + u32, size, u64, flags) +{ + struct pt_regs *regs = task_pt_regs(task); + + return __bpf_get_stack(regs, task, buf, size, flags); +} + +static int bpf_get_task_stack_btf_ids[5]; +const struct bpf_func_proto bpf_get_task_stack_proto = { + .func = bpf_get_task_stack, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_get_task_stack_btf_ids, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7de98906ddf4..b608185e1ffd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; - if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + if ((func_id == BPF_FUNC_get_stack || + func_id == BPF_FUNC_get_task_stack) && + !env->prog->has_callchain_buf) { const char *err_str; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5d59dda5f661..977ba3b6f6c6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ringbuf_query_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6bab40ff442e..6843376733df 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -426,6 +426,7 @@ class PrinterHelpers(Printer): 'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', 'struct __sk_buff', 'struct sk_msg_md', @@ -468,6 +469,7 @@ class PrinterHelpers(Printer): 
'struct tcp_timewait_sock', 'struct tcp_request_sock', 'struct udp6_sock', + 'struct task_struct', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0cb8ec948816..da9bf35a26f8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3285,6 +3285,39 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return * *sk* if casting is valid, or NULL otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to struct task_struct. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack=<new value> + * Return + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure.
+ * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3427,7 +3460,9 @@ union bpf_attr { FN(skc_to_tcp_sock), \ FN(skc_to_tcp_timewait_sock), \ FN(skc_to_tcp_request_sock), \ - FN(skc_to_udp6_sock), + FN(skc_to_udp6_sock), \ + FN(get_task_stack), \ + /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From e4266b991fead8eb996688e82ff39f6cc59ef7dd Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:05 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes to get the status Add MRP attribute IFLA_BRIDGE_MRP_INFO to allow the userspace to get the current state of the MRP instances. This is a nested attribute that contains other attributes like, ring id, index of primary and secondary port, priority, ring state, ring role. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index caa6914a3e53..c114c1c2bd53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -166,6 +166,7 @@ enum { IFLA_BRIDGE_MRP_RING_STATE, IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, + IFLA_BRIDGE_MRP_INFO, __IFLA_BRIDGE_MRP_MAX, }; @@ -228,6 +229,22 @@ enum { #define IFLA_BRIDGE_MRP_START_TEST_MAX (__IFLA_BRIDGE_MRP_START_TEST_MAX - 1) +enum { + IFLA_BRIDGE_MRP_INFO_UNSPEC, + IFLA_BRIDGE_MRP_INFO_RING_ID, + IFLA_BRIDGE_MRP_INFO_P_IFINDEX, + IFLA_BRIDGE_MRP_INFO_S_IFINDEX, + IFLA_BRIDGE_MRP_INFO_PRIO, + IFLA_BRIDGE_MRP_INFO_RING_STATE, + IFLA_BRIDGE_MRP_INFO_RING_ROLE, + IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + __IFLA_BRIDGE_MRP_INFO_MAX, +}; + +#define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) + struct br_mrp_instance { __u32 
ring_id; __u32 p_ifindex; -- cgit v1.2.3 From 36a8e8e26542056bbd7eb5e047cadee30587d230 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Thu, 2 Jul 2020 10:13:07 +0200 Subject: bridge: Extend br_fill_ifinfo to return MPR status This patch extends the function br_fill_ifinfo to return also the MRP status for each instance on a bridge. It also adds a new filter RTEXT_FILTER_MRP to return the MRP status only when this is set, so as not to interfere with the vlans. The MRP status is returned only on the bridge interfaces. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/bridge/br_netlink.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 879e64950a0a..9b814c92de12 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -778,6 +778,7 @@ enum { #define RTEXT_FILTER_BRVLAN (1 << 1) #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) #define RTEXT_FILTER_SKIP_STATS (1 << 3) +#define RTEXT_FILTER_MRP (1 << 4) /* End of information exported to user level */ diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 240e260e3461..c532fa65c983 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -453,6 +453,28 @@ static int br_fill_ifinfo(struct sk_buff *skb, rcu_read_unlock(); if (err) goto nla_put_failure; + + nla_nest_end(skb, af); + } + + if (filter_mask & RTEXT_FILTER_MRP) { + struct nlattr *af; + int err; + + if (!br_mrp_enabled(br) || port) + goto done; + + af = nla_nest_start_noflag(skb, IFLA_AF_SPEC); + if (!af) + goto nla_put_failure; + + rcu_read_lock(); + err = br_mrp_fill_info(skb, br); + rcu_read_unlock(); + + if (err) + goto nla_put_failure; + nla_nest_end(skb, af); } @@ -516,7 +538,8 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_bridge_port *port = br_port_get_rtnl(dev);
if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) && - !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) + !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) && + !(filter_mask & RTEXT_FILTER_MRP)) return 0; return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags, -- cgit v1.2.3 From 74cccc3d38438b346e40a4f8133cff3f0839ff84 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:11 +0200 Subject: netfilter: nf_tables: add NFTA_CHAIN_ID attribute This netlink attribute allows you to refer to chains inside a transaction as an alternative to the name and the handle. The chain binding support requires this new chain ID approach. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 15 ++++++++++++--- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6f0f6fca9ac3..3e5226684017 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1433,6 +1433,7 @@ struct nft_trans_chain { char *name; struct nft_stats __percpu *stats; u8 policy; + u32 chain_id; }; #define nft_trans_chain_update(trans) \ @@ -1443,6 +1444,8 @@ struct nft_trans_chain { (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) +#define nft_trans_chain_id(trans) \ + (((struct nft_trans_chain *)trans->data)->chain_id) struct nft_trans_table { bool update; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4565456c0ef4..477779595b78 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -196,6 +196,7 @@ enum nft_table_attributes { * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING) * @NFTA_CHAIN_COUNTERS: counter specification of the chain 
(NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags + * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -209,6 +210,7 @@ enum nft_chain_attributes { NFTA_CHAIN_COUNTERS, NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, + NFTA_CHAIN_ID, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7647ecfa0d40..650ef0dd0773 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -280,9 +280,15 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) if (trans == NULL) return ERR_PTR(-ENOMEM); - if (msg_type == NFT_MSG_NEWCHAIN) + if (msg_type == NFT_MSG_NEWCHAIN) { nft_activate_next(ctx->net, ctx->chain); + if (ctx->nla[NFTA_CHAIN_ID]) { + nft_trans_chain_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID])); + } + } + list_add_tail(&trans->list, &ctx->net->nft.commit_list); return trans; } @@ -1274,6 +1280,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { .len = NFT_MODULE_AUTOLOAD_LIMIT }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, + [NFTA_CHAIN_ID] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -2154,9 +2161,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct nft_chain *chain = NULL; const struct nlattr *attr; struct nft_table *table; - struct nft_chain *chain; u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; @@ -2181,7 +2188,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } attr = nla[NFTA_CHAIN_HANDLE]; - } else { + } else if (nla[NFTA_CHAIN_NAME]) { chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { 
if (PTR_ERR(chain) != -ENOENT) { @@ -2190,6 +2197,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } chain = NULL; } + } else if (!nla[NFTA_CHAIN_ID]) { + return -EINVAL; } if (nla[NFTA_CHAIN_POLICY]) { -- cgit v1.2.3 From 837830a4b439bfeb86c70b0115c280377c84714b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:16 +0200 Subject: netfilter: nf_tables: add NFTA_RULE_CHAIN_ID attribute This new netlink attribute allows you to add rules to chains by the chain ID. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 36 ++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 477779595b78..2304d1b7ba5e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -240,6 +240,7 @@ enum nft_rule_attributes { NFTA_RULE_PAD, NFTA_RULE_ID, NFTA_RULE_POSITION_ID, + NFTA_RULE_CHAIN_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 650ef0dd0773..fbe8f9209813 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2153,6 +2153,22 @@ err: return err; } +static struct nft_chain *nft_chain_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWCHAIN && + id == nft_trans_chain_id(trans)) + return chain; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2633,6 +2649,7 @@ static const struct nla_policy 
nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_RULE_ID] = { .type = NLA_U32 }, [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 }, + [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 }, }; static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, @@ -3039,10 +3056,21 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) { - NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); - return PTR_ERR(chain); + if (nla[NFTA_RULE_CHAIN]) { + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); + return PTR_ERR(chain); + } + } else if (nla[NFTA_RULE_CHAIN_ID]) { + chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]); + return PTR_ERR(chain); + } + } else { + return -EINVAL; } if (nla[NFTA_RULE_HANDLE]) { -- cgit v1.2.3 From 51d70f181ff4e2c996ddf256af1efecd7d5864e5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:21 +0200 Subject: netfilter: nf_tables: add NFTA_VERDICT_CHAIN_ID attribute This netlink attribute allows you to identify the chain to jump/goto by means of the chain ID. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2304d1b7ba5e..683e75126d68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -471,11 +471,13 @@ enum nft_data_attributes { * * @NFTA_VERDICT_CODE: nf_tables verdict (NLA_U32: enum nft_verdicts) * @NFTA_VERDICT_CHAIN: jump target chain name (NLA_STRING) + * @NFTA_VERDICT_CHAIN_ID: jump target chain ID (NLA_U32) */ enum nft_verdict_attributes { NFTA_VERDICT_UNSPEC, NFTA_VERDICT_CODE, NFTA_VERDICT_CHAIN, + NFTA_VERDICT_CHAIN_ID, __NFTA_VERDICT_MAX }; #define NFTA_VERDICT_MAX (__NFTA_VERDICT_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fbe8f9209813..d86602797a69 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8242,6 +8242,7 @@ static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = { [NFTA_VERDICT_CODE] = { .type = NLA_U32 }, [NFTA_VERDICT_CHAIN] = { .type = NLA_STRING, .len = NFT_CHAIN_MAXNAMELEN - 1 }, + [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 }, }; static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, @@ -8278,10 +8279,19 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, break; case NFT_JUMP: case NFT_GOTO: - if (!tb[NFTA_VERDICT_CHAIN]) + if (tb[NFTA_VERDICT_CHAIN]) { + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], + genmask); + } else if (tb[NFTA_VERDICT_CHAIN_ID]) { + chain = nft_chain_lookup_byid(ctx->net, + tb[NFTA_VERDICT_CHAIN_ID]); + if (IS_ERR(chain)) + return PTR_ERR(chain); + } else { return -EINVAL; - chain = nft_chain_lookup(ctx->net, ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + } + if (IS_ERR(chain)) 
return PTR_ERR(chain); if (nft_is_base_chain(chain)) -- cgit v1.2.3 From 67c49de4ad862c567088c5119cf125e566f56e7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:25 +0200 Subject: netfilter: nf_tables: expose enum nft_chain_flags through UAPI This enum definition was never exposed through UAPI. Rename NFT_BASE_CHAIN to NFT_CHAIN_BASE for consistency. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +------ include/uapi/linux/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3e5226684017..6d1e7da6e00a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -921,11 +921,6 @@ static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, (expr) != (last); \ (expr) = nft_expr_next(expr)) -enum nft_chain_flags { - NFT_BASE_CHAIN = 0x1, - NFT_CHAIN_HW_OFFLOAD = 0x2, -}; - #define NFT_CHAIN_POLICY_UNSET U8_MAX /** @@ -1036,7 +1031,7 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai static inline bool nft_is_base_chain(const struct nft_chain *chain) { - return chain->flags & NFT_BASE_CHAIN; + return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683e75126d68..2cf7cc3b50c1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -184,6 +184,11 @@ enum nft_table_attributes { }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) +enum nft_chain_flags { + NFT_CHAIN_BASE = (1 << 0), + NFT_CHAIN_HW_OFFLOAD = (1 << 1), +}; + /** * enum nft_chain_attributes - nf_tables chain netlink attributes * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 
d86602797a69..b7582a1c8dce 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1903,7 +1903,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, nft_basechain_hook_init(&basechain->ops, family, hook, chain); } - chain->flags |= NFT_BASE_CHAIN | flags; + chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && nft_chain_offload_priority(basechain) < 0) @@ -2255,7 +2255,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - flags |= chain->flags & NFT_BASE_CHAIN; + flags |= chain->flags & NFT_CHAIN_BASE; return nf_tables_updchain(&ctx, genmask, policy, flags); } -- cgit v1.2.3 From d0e2c7de92c7f2b3d355ad76b0bb9fc43d1beb87 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Jun 2020 19:21:36 +0200 Subject: netfilter: nf_tables: add NFT_CHAIN_BINDING This new chain flag specifies that: * the kernel dynamically allocates the chain name, if no chain name is specified. * If the immediate expression that refers to this chain is removed, then this bound chain (and its content) is destroyed. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++++- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 86 +++++++++++++++++++++++++++----- net/netfilter/nft_immediate.c | 51 +++++++++++++++++++ 4 files changed, 138 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6d1e7da6e00a..822c26766330 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -899,6 +899,8 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) return (void *)&rule->data[rule->dlen]; } +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule); + static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -944,7 +946,8 @@ struct nft_chain { struct nft_table *table; u64 handle; u32 use; - u8 flags:6, + u8 flags:5, + bound:1, genmask:2; char *name; @@ -989,6 +992,14 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); +static inline bool nft_chain_is_bound(struct nft_chain *chain) +{ + return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; +} + +void nft_chain_del(struct nft_chain *chain); +void nf_tables_chain_destroy(struct nft_ctx *ctx); + struct nft_stats { u64 bytes; u64 pkts; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2cf7cc3b50c1..e00b4ae6174e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -187,6 +187,7 @@ enum nft_table_attributes { enum nft_chain_flags { NFT_CHAIN_BASE = (1 << 0), NFT_CHAIN_HW_OFFLOAD = (1 << 1), + NFT_CHAIN_BINDING = (1 << 2), }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7cb9c07802b..b8a970dad213 100644 --- 
a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1056,6 +1056,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delrule_by_chain(ctx); @@ -1098,6 +1101,9 @@ static int nft_flush_table(struct nft_ctx *ctx) if (!nft_is_active_next(ctx->net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; + ctx->chain = chain; err = nft_delchain(ctx); @@ -1413,13 +1419,12 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, lockdep_commit_lock_is_held(net)); if (nft_dump_stats(skb, stats)) goto nla_put_failure; - - if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) && - nla_put_be32(skb, NFTA_CHAIN_FLAGS, - htonl(NFT_CHAIN_HW_OFFLOAD))) - goto nla_put_failure; } + if (chain->flags && + nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; @@ -1621,7 +1626,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); } -static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; struct nft_hook *hook, *next; @@ -1928,6 +1933,8 @@ static int nft_chain_add(struct nft_table *table, struct nft_chain *chain) return 0; } +static u64 chain_id; + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, u32 flags) { @@ -1936,6 +1943,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_base_chain *basechain; struct nft_stats __percpu *stats; struct net *net = ctx->net; + char name[NFT_NAME_MAXLEN]; struct nft_trans *trans; struct nft_chain *chain; struct nft_rule **rules; @@ -1947,6 +1955,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_HOOK]) { struct nft_chain_hook 
hook; + if (flags & NFT_CHAIN_BINDING) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(net, nla, &hook, family, true); if (err < 0) return err; @@ -1976,16 +1987,33 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return err; } } else { + if (flags & NFT_CHAIN_BASE) + return -EINVAL; + if (flags & NFT_CHAIN_HW_OFFLOAD) + return -EOPNOTSUPP; + chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return -ENOMEM; + + chain->flags = flags; } ctx->chain = chain; INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + + if (nla[NFTA_CHAIN_NAME]) { + chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL); + } else { + if (!(flags & NFT_CHAIN_BINDING)) + return -EINVAL; + + snprintf(name, sizeof(name), "__chain%llu", ++chain_id); + chain->name = kstrdup(name, GFP_KERNEL); + } + if (!chain->name) { err = -ENOMEM; goto err1; @@ -2976,8 +3004,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, kfree(rule); } -static void nf_tables_rule_release(const struct nft_ctx *ctx, - struct nft_rule *rule) +void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -3075,6 +3102,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; + } else if (nla[NFTA_RULE_CHAIN_ID]) { chain = nft_chain_lookup_byid(net, nla[NFTA_RULE_CHAIN_ID]); if (IS_ERR(chain)) { @@ -3294,6 +3324,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); } + if (nft_chain_is_bound(chain)) + return -EOPNOTSUPP; } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -5330,11 +5362,24 @@ static int 
nf_tables_newsetelem(struct net *net, struct sock *nlsk, */ void nft_data_hold(const struct nft_data *data, enum nft_data_types type) { + struct nft_chain *chain; + struct nft_rule *rule; + if (type == NFT_DATA_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use++; + chain = data->verdict.chain; + chain->use++; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use++; + list_for_each_entry(rule, &chain->rules, list) + chain->use++; + + nft_chain_add(chain->table, chain); break; } } @@ -7474,7 +7519,7 @@ static void nft_obj_del(struct nft_object *obj) list_del_rcu(&obj->list); } -static void nft_chain_del(struct nft_chain *chain) +void nft_chain_del(struct nft_chain *chain) { struct nft_table *table = chain->table; @@ -7825,6 +7870,10 @@ static int __nf_tables_abort(struct net *net, bool autoload) kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); } else { + if (nft_chain_is_bound(trans->ctx.chain)) { + nft_trans_destroy(trans); + break; + } trans->ctx.table->use--; nft_chain_del(trans->ctx.chain); nf_tables_unregister_hook(trans->ctx.net, @@ -8321,10 +8370,23 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, static void nft_verdict_uninit(const struct nft_data *data) { + struct nft_chain *chain; + struct nft_rule *rule; + switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: - data->verdict.chain->use--; + chain = data->verdict.chain; + chain->use--; + + if (!nft_chain_is_bound(chain)) + break; + + chain->table->use--; + list_for_each_entry(rule, &chain->rules, list) + chain->use--; + + nft_chain_del(chain); break; } } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index c7f0ef73d939..9e556638bb32 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -54,6 +54,23 @@ static int nft_immediate_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (priv->dreg == NFT_REG_VERDICT) { + struct nft_chain 
*chain = priv->data.verdict.chain; + + switch (priv->data.verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + if (nft_chain_is_bound(chain)) { + err = -EBUSY; + goto err1; + } + chain->bound = true; + break; + default: + break; + } + } + return 0; err1: @@ -81,6 +98,39 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx, return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } +static void nft_immediate_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data = &priv->data; + struct nft_ctx chain_ctx; + struct nft_chain *chain; + struct nft_rule *rule; + + if (priv->dreg != NFT_REG_VERDICT) + return; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + chain = data->verdict.chain; + + if (!nft_chain_is_bound(chain)) + break; + + chain_ctx = *ctx; + chain_ctx.chain = chain; + + list_for_each_entry(rule, &chain->rules, list) + nf_tables_rule_release(&chain_ctx, rule); + + nf_tables_chain_destroy(&chain_ctx); + break; + default: + break; + } +} + static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); @@ -170,6 +220,7 @@ static const struct nft_expr_ops nft_imm_ops = { .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, + .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, -- cgit v1.2.3 From c1f79a2eefdcc0aef5d7a911c27a3f75f1936ecd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Jul 2020 02:51:28 +0200 Subject: netfilter: nf_tables: reject unsupported chain flags Bail out if userspace sends unsupported chain flags. 
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e00b4ae6174e..42f351c1f5c5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -189,6 +189,9 @@ enum nft_chain_flags { NFT_CHAIN_HW_OFFLOAD = (1 << 1), NFT_CHAIN_BINDING = (1 << 2), }; +#define NFT_CHAIN_FLAGS (NFT_CHAIN_BASE | \ + NFT_CHAIN_HW_OFFLOAD | \ + NFT_CHAIN_BINDING) /** * enum nft_chain_attributes - nf_tables chain netlink attributes diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b8a970dad213..f96785586f64 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2285,6 +2285,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, else if (chain) flags = chain->flags; + if (flags & ~NFT_CHAIN_FLAGS) + return -EOPNOTSUPP; + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { -- cgit v1.2.3 From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- include/linux/bpf-cgroup.h | 4 ++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 3 +++ net/core/filter.c | 1 + net/ipv4/af_inet.c | 3 +++ 5 files changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c66c545e161a..2c6f26670acc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -210,6 +210,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_RELEASE) + #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) @@ -401,6 +404,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index da9bf35a26f8..548a749aebb3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -226,6 +226,7 @@ enum bpf_attach_type { BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..156f51ffada2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1981,6 +1981,7 @@ 
bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; @@ -2779,6 +2780,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; @@ -2929,6 +2931,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: diff --git a/net/core/filter.c b/net/core/filter.c index c5e696e6c315..ddcc0d6209e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,6 +6890,7 @@ static bool __sock_filter_check_attach_type(int off, case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ea6ed6d487ed..ff141d630bdf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 065e0d42a0a728d7f6c2aec7c9f3e5dc7b715394 Mon Sep 17 00:00:00 2001 From: Meir Lichtinger Date: Mon, 6 Jul 2020 20:42:32 -0700 Subject: ethtool: Add support for 100Gbps per lane link modes Define 100G, 200G and 400G link modes using 100Gbps per lane LR, ER and FR are defined as a single link mode because they are using same technology and by design are 
fully interoperable. EEPROM content indicates if the module is LR, ER, or FR, and the user space ethtool decoder is planned to support decoding these modes in the EEPROM. Signed-off-by: Meir Lichtinger CC: Andrew Lunn Reviewed-by: Aya Levin Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 17 ++++++++++++++++- include/uapi/linux/ethtool.h | 15 +++++++++++++++ net/ethtool/common.c | 15 +++++++++++++++ net/ethtool/linkmodes.c | 15 +++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 46bd68e9ecfa..ff8e14b01eeb 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 75, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 90, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -78,12 +78,22 @@ static const struct phy_setting settings[] = { PHY_SETTING( 400000, FULL, 400000baseLR8_ER8_FR8_Full ), PHY_SETTING( 400000, FULL, 400000baseDR8_Full ), PHY_SETTING( 400000, FULL, 400000baseSR8_Full ), + PHY_SETTING( 400000, FULL, 400000baseCR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseKR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseLR4_ER4_FR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseDR4_Full ), + PHY_SETTING( 400000, FULL, 400000baseSR4_Full ), /* 200G */ PHY_SETTING( 200000, FULL, 200000baseCR4_Full ), PHY_SETTING( 200000, FULL, 200000baseKR4_Full ), PHY_SETTING( 200000, FULL, 200000baseLR4_ER4_FR4_Full ), PHY_SETTING( 200000, FULL, 200000baseDR4_Full ), PHY_SETTING( 200000, FULL, 200000baseSR4_Full ), + PHY_SETTING( 200000, FULL, 200000baseCR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseKR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseLR2_ER2_FR2_Full ), + 
PHY_SETTING( 200000, FULL, 200000baseDR2_Full ), + PHY_SETTING( 200000, FULL, 200000baseSR2_Full ), /* 100G */ PHY_SETTING( 100000, FULL, 100000baseCR4_Full ), PHY_SETTING( 100000, FULL, 100000baseKR4_Full ), @@ -94,6 +104,11 @@ static const struct phy_setting settings[] = { PHY_SETTING( 100000, FULL, 100000baseLR2_ER2_FR2_Full ), PHY_SETTING( 100000, FULL, 100000baseDR2_Full ), PHY_SETTING( 100000, FULL, 100000baseSR2_Full ), + PHY_SETTING( 100000, FULL, 100000baseCR_Full ), + PHY_SETTING( 100000, FULL, 100000baseKR_Full ), + PHY_SETTING( 100000, FULL, 100000baseLR_ER_FR_Full ), + PHY_SETTING( 100000, FULL, 100000baseDR_Full ), + PHY_SETTING( 100000, FULL, 100000baseSR_Full ), /* 56G */ PHY_SETTING( 56000, FULL, 56000baseCR4_Full ), PHY_SETTING( 56000, FULL, 56000baseKR4_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1413538ef30..60856e0f9618 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1600,6 +1600,21 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74, + ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75, + ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76, + ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77, + ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78, + ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79, + ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80, + ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81, + ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82, + ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83, + ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84, + ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85, + ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86, + ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, + ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, + ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git 
a/net/ethtool/common.c b/net/ethtool/common.c index ce4dbae5a943..c54166713797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -176,6 +176,21 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, DR8, Full), __DEFINE_LINK_MODE_NAME(400000, CR8, Full), __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"), + __DEFINE_LINK_MODE_NAME(100000, KR, Full), + __DEFINE_LINK_MODE_NAME(100000, SR, Full), + __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_NAME(100000, DR, Full), + __DEFINE_LINK_MODE_NAME(100000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR2, Full), + __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_NAME(200000, DR2, Full), + __DEFINE_LINK_MODE_NAME(200000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR4, Full), + __DEFINE_LINK_MODE_NAME(400000, SR4, Full), + __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_NAME(400000, DR4, Full), + __DEFINE_LINK_MODE_NAME(400000, CR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fd4f3e58c6f6..317a93129551 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -257,6 +257,21 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), + 
__DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), }; static const struct nla_policy -- cgit v1.2.3 From 3f935c75eb52dd968351dba824adf466fb9c9429 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:39 +0200 Subject: inet_diag: support for wider protocol numbers After commit bf9765145b85 ("sock: Make sk_protocol a 16-bit value") the current size of 'sdiag_protocol' is not sufficient to represent the possible protocol values. This change introduces a new inet diag request attribute to let user space specify the relevant protocol number using u32 values. The attribute is parsed by inet diag core on get/dump command and the extended protocol value, if available, is preferred to 'sdiag_protocol' to look up the diag handler. The parsed attributes are exposed to all the diag handlers via the cb->data. Note that inet_diag_dump_one_icsk() is left unmodified, as it will not be used by protocols using the extended attribute. Suggested-by: David S. Miller Co-developed-by: Christoph Paasch Signed-off-by: Christoph Paasch Acked-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S.
Miller --- include/uapi/linux/inet_diag.h | 1 + net/core/sock.c | 1 + net/ipv4/inet_diag.c | 65 +++++++++++++++++++++++++++++++----------- 3 files changed, 50 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index e6f183ee8417..5ba122c1949a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -65,6 +65,7 @@ enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, INET_DIAG_REQ_SK_BPF_STORAGES, + INET_DIAG_REQ_PROTOCOL, __INET_DIAG_REQ_MAX, }; diff --git a/net/core/sock.c b/net/core/sock.c index f5b5fdd61c88..de26fe4ea19f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3566,6 +3566,7 @@ int sock_load_diag_module(int family, int protocol) #ifdef CONFIG_INET if (family == AF_INET && protocol != IPPROTO_RAW && + protocol < MAX_INET_PROTOS && !rcu_access_pointer(inet_protos[protocol])) return -ENOENT; #endif diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return 
nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = 
inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@ -1139,19 +1174,13 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; - int rem, err; + int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; - nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), rem) { - int type = nla_type(nla); - - if (type < __INET_DIAG_REQ_MAX) - cb_data->req_nlas[type] = nla; - } + inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); nla = cb_data->inet_diag_nla_bc; if (nla) { @@ -1237,7 +1266,8 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req); + return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, + sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -1279,7 +1309,8 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) return netlink_dump_start(net->diag_nlsk, skb, h, &c); } - return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); + return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, + nlmsg_data(h)); } static -- cgit v1.2.3 From ac3b45f6095452a9731f8825be1513d326dbfa15 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:41 +0200 Subject: mptcp: add MPTCP socket diag interface exposes basic inet socket attribute, plus some MPTCP socket fields comprising PM status and MPTCP-level sequence numbers. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- include/uapi/linux/mptcp.h | 17 +++++ net/mptcp/Kconfig | 4 ++ net/mptcp/Makefile | 2 + net/mptcp/mptcp_diag.c | 169 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 net/mptcp/mptcp_diag.c (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5f2c77082d9e..9762660df741 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -86,4 +86,21 @@ enum { __MPTCP_PM_CMD_AFTER_LAST }; +#define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) +#define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) + +struct mptcp_info { + __u8 mptcpi_subflows; + __u8 mptcpi_add_addr_signal; + __u8 mptcpi_add_addr_accepted; + __u8 mptcpi_subflows_max; + __u8 mptcpi_add_addr_signal_max; + __u8 mptcpi_add_addr_accepted_max; + __u32 mptcpi_flags; + __u32 mptcpi_token; + __u64 mptcpi_write_seq; + __u64 mptcpi_snd_una; + __u64 mptcpi_rcv_nxt; +}; + #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index af84fce70bb0..698bc3525160 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -13,6 +13,10 @@ config MPTCP if MPTCP +config INET_MPTCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" select IPV6 diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index c53f9b845523..2360cbd27d59 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -4,6 +4,8 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o +obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o + mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..5f390a97f556 --- /dev/null +++ b/net/mptcp/mptcp_diag.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: 
GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2020 Red Hat + * + * Author: Paolo Abeni + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc, bool net_admin) +{ + if (!inet_diag_bc_sk(bc, sk)) + return 0; + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, + net_admin); +} + +static int mptcp_diag_dump_one(struct netlink_callback *cb, + const struct inet_diag_req_v2 *req) +{ + struct sk_buff *in_skb = cb->skb; + struct mptcp_sock *msk = NULL; + struct sk_buff *rep; + int err = -ENOENT; + struct net *net; + struct sock *sk; + + net = sock_net(in_skb->sk); + msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + if (!msk) + goto out_nosk; + + err = -ENOMEM; + sk = (struct sock *)msk; + rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct mptcp_info)) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, + GFP_KERNEL); + if (!rep) + goto out; + + err = inet_sk_diag_fill(sk, inet_csk(sk), rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + sock_put(sk); + +out_nosk: + return err; +} + +static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r) +{ + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; + struct mptcp_sock *msk; + struct nlattr *bc; + + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; + + while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != + NULL) { + struct inet_sock *inet = (struct 
inet_sock *)msk; + struct sock *sk = (struct sock *)msk; + int ret = 0; + + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); +next: + sock_put(sk); + if (ret < 0) { + /* will retry on the same position */ + cb->args[1]--; + break; + } + cond_resched(); + } +} + +static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_info *info = _info; + u32 flags = 0; + bool slow; + u8 val; + + r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_wqueue = sk_wmem_alloc_get(sk); + if (!info) + return; + + slow = lock_sock_fast(sk); + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max); + val = READ_ONCE(msk->pm.add_addr_signal_max); + info->mptcpi_add_addr_signal_max = val; + val = READ_ONCE(msk->pm.add_addr_accept_max); + info->mptcpi_add_addr_accepted_max = val; + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = atomic64_read(&msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + unlock_sock_fast(sk, slow); +} + +static const struct inet_diag_handler mptcp_diag_handler = { + .dump = mptcp_diag_dump, + .dump_one = mptcp_diag_dump_one, + .idiag_get_info = mptcp_diag_get_info, + .idiag_type = 
IPPROTO_MPTCP, + .idiag_info_size = sizeof(struct mptcp_info), +}; + +static int __init mptcp_diag_init(void) +{ + return inet_diag_register(&mptcp_diag_handler); +} + +static void __exit mptcp_diag_exit(void) +{ + inet_diag_unregister(&mptcp_diag_handler); +} + +module_init(mptcp_diag_init); +module_exit(mptcp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); -- cgit v1.2.3 From a21cf0a8330bba60e44ca6c99e1591042f336ff5 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:18 +0300 Subject: devlink: Add a new devlink port lanes attribute and pass to netlink Add a new devlink port attribute that indicates the port's number of lanes. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. The attribute is not passed to user space in case the number of lanes is invalid (0). Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 4 ++++ 4 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f44cb1a537f3..6cde196f6b70 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2134,6 +2134,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, int err; attrs.split = split; + attrs.lanes = lanes; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/include/net/devlink.h b/include/net/devlink.h index 8f9db991192d..91a9f8770d08 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,10 +68,12 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. 
* @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { u8 split:1; + u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; union { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 87c83a82991b..f741ab8d9cf0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -455,6 +455,8 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ + DEVLINK_ATTR_PORT_LANES, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 266936c38357..7f26d1054974 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -530,6 +530,10 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (!devlink_port->attrs_set) return 0; + if (attrs->lanes) { + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) + return -EMSGSIZE; + } if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { -- cgit v1.2.3 From a0f49b54865273c895be3826d6d59cbc5ad725c2 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Thu, 9 Jul 2020 16:18:20 +0300 Subject: devlink: Add a new devlink port split ability attribute and pass to netlink Add a new attribute that indicates the split ability of devlink port. Drivers are expected to set it via devlink_port_attrs_set(), before registering the port. Signed-off-by: Danielle Ratson Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 1 + include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 3 +++ 5 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index f85f5d88d331..8b3791d73c99 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2135,6 +2135,7 @@ static int __mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port, attrs.split = split; attrs.lanes = lanes; + attrs.splittable = splittable; attrs.flavour = flavour; attrs.phys.port_number = port_number; attrs.phys.split_subport_number = split_port_subnumber; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 71f4e624b3db..b6a10565309a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -367,6 +367,7 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) return ret; attrs.split = eth_port.is_split; + attrs.splittable = !attrs.split; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = eth_port.label_port; attrs.phys.split_subport_number = eth_port.label_subport; diff --git a/include/net/devlink.h b/include/net/devlink.h index 91a9f8770d08..746bed538664 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -68,11 +68,13 @@ struct devlink_port_pci_vf_attrs { * struct devlink_port_attrs - devlink port object * @flavour: flavour of the port * @split: indicates if this is split port + * @splittable: indicates if the port can be split. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. 
* @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL */ struct devlink_port_attrs { - u8 split:1; + u8 split:1, + splittable:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index f741ab8d9cf0..cfef4245ea5a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -456,6 +456,7 @@ enum devlink_attr { DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, /* string */ DEVLINK_ATTR_PORT_LANES, /* u32 */ + DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f26d1054974..94c797b74378 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -534,6 +534,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) + return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { @@ -7547,6 +7549,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; + WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); -- cgit v1.2.3 From c7d759eb7b12f91a25f4d3cd03ff5209046ddfc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jul 2020 17:42:47 -0700 Subject: ethtool: add tunnel info interface Add an interface to report offloaded UDP ports via ethtool netlink. Now that core takes care of tracking which UDP tunnel ports the NICs are aware of we can quite easily export this information out to user space. 
The responsibility of writing the netlink dumps is split between ethtool code and udp_tunnel_nic.c - since udp_tunnel module may not always be loaded, yet we should always report the capabilities of the NIC. $ ethtool --show-tunnels eth0 Tunnel information for eth0: UDP port table 0: Size: 4 Types: vxlan No entries UDP port table 1: Size: 4 Types: geneve, vxlan-gpe Entries (1): port 1230, vxlan-gpe v4: - back to v2, build fix is now directly in udp_tunnel.h v3: - don't compile ETHTOOL_MSG_TUNNEL_INFO_GET in if CONFIG_INET not set. v2: - fix string set count, - reorder enums in the uAPI, - fix type of ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES to bitset in docs and comments. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 33 ++++ include/net/udp_tunnel.h | 21 +++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 55 ++++++ net/ethtool/Makefile | 3 +- net/ethtool/common.c | 9 + net/ethtool/common.h | 1 + net/ethtool/netlink.c | 12 ++ net/ethtool/netlink.h | 4 + net/ethtool/strset.c | 5 + net/ethtool/tunnels.c | 259 +++++++++++++++++++++++++++ net/ipv4/udp_tunnel_nic.c | 69 +++++++ 12 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/tunnels.c (limited to 'include/uapi') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 459a0d11cfde..7d75f1e32152 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1230,6 +1230,39 @@ used to report the amplitude of the reflection for a given pair. | | | ``ETHTOOL_A_CABLE_AMPLITUDE_mV`` | s16 | Reflection amplitude | +-+-+-----------------------------------------+--------+----------------------+ +TUNNEL_INFO +=========== + +Gets information about the tunnel state NIC is aware of. 
+ +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_TUNNEL_INFO_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_TUNNEL_INFO_UDP_PORTS`` | nested | all UDP port tables | + +-+-------------------------------------------+--------+---------------------+ + | | ``ETHTOOL_A_TUNNEL_UDP_TABLE`` | nested | one UDP port table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE`` | u32 | max size of the | + | | | | | table | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` | bitset | tunnel types which | + | | | | | table can hold | + +-+-+-----------------------------------------+--------+---------------------+ + | | | ``ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY`` | nested | offloaded UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT`` | be16 | UDP port | + +-+-+-+---------------------------------------+--------+---------------------+ + | | | | ``ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE`` | u32 | tunnel type | + +-+-+-+---------------------------------------+--------+---------------------+ + Request translation =================== diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index ee34619e4cfa..dd20ce99740c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -255,6 +255,10 @@ struct udp_tunnel_nic_ops { void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); 
void (*reset_ntf)(struct net_device *dev); + + size_t (*dump_size)(struct net_device *dev, unsigned int table); + int (*dump_write)(struct net_device *dev, unsigned int table, + struct sk_buff *skb); }; #ifdef CONFIG_INET @@ -318,4 +322,21 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) if (udp_tunnel_nic_ops) udp_tunnel_nic_ops->reset_ntf(dev); } + +static inline size_t +udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_size(dev, table); +} + +static inline int +udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + if (!udp_tunnel_nic_ops) + return 0; + return udp_tunnel_nic_ops->dump_write(dev, table, skb); +} #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 60856e0f9618..b4f2d134e713 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -669,6 +669,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags * @ETH_SS_TS_TX_TYPES: timestamping Tx types * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters + * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -686,6 +687,7 @@ enum ethtool_stringset { ETH_SS_SOF_TIMESTAMPING, ETH_SS_TS_TX_TYPES, ETH_SS_TS_RX_FILTERS, + ETH_SS_UDP_TUNNEL_TYPES, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c12ce4df4b6b..5dcd24cb33ea 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -41,6 +41,7 @@ enum { ETHTOOL_MSG_TSINFO_GET, ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -556,6 +557,60 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 }; +/* TUNNEL INFO 
*/ + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + __ETHTOOL_UDP_TUNNEL_TYPE_CNT +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0c2b94f20499..7a849ff22dad 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ + tunnels.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index c54166713797..ed19573fccd7 
100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -272,6 +273,14 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", + [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", + [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe", +}; +static_assert(ARRAY_SIZE(udp_tunnel_type_names) == + __ETHTOOL_UDP_TUNNEL_TYPE_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b83bef38368c..3d9251c95a8b 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -28,6 +28,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 88fd07f47040..fb9d096faaa4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,6 +181,12 @@ err: return NULL; } +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) +{ + return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ethtool_genl_family, 0, cmd); +} + void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0, @@ -849,6 +855,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, }, + { + .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, + .doit = ethnl_tunnel_info_doit, + .start = ethnl_tunnel_info_start, + .dumpit =
ethnl_tunnel_info_dumpit, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9a96b6e90dc2..e2085005caac 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,7 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); @@ -361,5 +362,8 @@ int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_tunnel_info_start(struct netlink_callback *cb); +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 773634b6b048..82707b662fe4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_UDP_TUNNEL_TYPES] = { + .per_dev = false, + .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + .strings = udp_tunnel_type_names, + }, }; struct strset_req_info { diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c new file mode 100644 index 000000000000..6b89255f1231 --- /dev/null +++ b/net/ethtool/tunnels.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include "bitset.h" +#include "common.h" +#include "netlink.h" + +static 
const struct nla_policy +ethtool_tunnel_info_policy[ETHTOOL_A_TUNNEL_INFO_MAX + 1] = { + [ETHTOOL_A_TUNNEL_INFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TUNNEL_INFO_HEADER] = { .type = NLA_NESTED }, +}; + +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE)); +static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE == + ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE)); + +static ssize_t +ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + unsigned int i; + size_t size; + int ret; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) { + NL_SET_ERR_MSG(extack, + "device does not report tunnel offload info"); + return -EOPNOTSUPP; + } + + size = nla_total_size(0); /* _INFO_UDP_PORTS */ + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + return size; + + size += nla_total_size(0); /* _UDP_TABLE */ + size += nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */ + ret = ethnl_bitset32_size(&info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact); + if (ret < 0) + return ret; + size += ret; + + size += udp_tunnel_nic_dump_size(req_base->dev, i); + } + + return size; +} + +static int +ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, + struct sk_buff *skb) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct udp_tunnel_nic_info *info; + struct nlattr *ports, *table; + unsigned int i; + + info = req_base->dev->udp_tunnel_nic_info; + if (!info) + return -EOPNOTSUPP; + + ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS); + if (!ports) + return -EMSGSIZE; + + for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + break; + + table = nla_nest_start(skb, 
ETHTOOL_A_TUNNEL_UDP_TABLE); + if (!table) + goto err_cancel_ports; + + if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + info->tables[i].n_entries)) + goto err_cancel_table; + + if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + &info->tables[i].tunnel_types, NULL, + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + udp_tunnel_type_names, compact)) + goto err_cancel_table; + + if (udp_tunnel_nic_dump_write(req_base->dev, i, skb)) + goto err_cancel_table; + + nla_nest_end(skb, table); + } + + nla_nest_end(skb, ports); + + return 0; + +err_cancel_table: + nla_nest_cancel(skb, table); +err_cancel_ports: + nla_nest_cancel(skb, ports); + return -EMSGSIZE; +} + +static int +ethnl_tunnel_info_req_parse(struct ethnl_req_info *req_info, + const struct nlmsghdr *nlhdr, struct net *net, + struct netlink_ext_ack *extack, bool require_dev) +{ + struct nlattr *tb[ETHTOOL_A_TUNNEL_INFO_MAX + 1]; + int ret; + + ret = nlmsg_parse(nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_TUNNEL_INFO_MAX, + ethtool_tunnel_info_policy, extack); + if (ret < 0) + return ret; + + return ethnl_parse_header_dev_get(req_info, + tb[ETHTOOL_A_TUNNEL_INFO_HEADER], + net, extack, require_dev); +} + +int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_tunnel_info_req_parse(&req_info, info->nlhdr, + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + ret = ethnl_tunnel_info_reply_size(&req_info, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.dev, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_A_TUNNEL_INFO_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_tunnel_info_fill_reply(&req_info, rskb); + if (ret) + goto err_free_msg; + rtnl_unlock(); + dev_put(req_info.dev); + 
genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + dev_put(req_info.dev); + return ret; +} + +struct ethnl_tunnel_info_dump_ctx { + struct ethnl_req_info req_info; + int pos_hash; + int pos_idx; +}; + +int ethnl_tunnel_info_start(struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + memset(ctx, 0, sizeof(*ctx)); + + ret = ethnl_tunnel_info_req_parse(&ctx->req_info, cb->nlh, + sock_net(cb->skb->sk), cb->extack, + false); + if (ctx->req_info.dev) { + dev_put(ctx->req_info.dev); + ctx->req_info.dev = NULL; + } + + return ret; +} + +int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + int s_idx = ctx->pos_idx; + int h, idx = 0; + int ret = 0; + void *ehdr; + + rtnl_lock(); + cb->seq = net->dev_base_seq; + for (h = ctx->pos_hash; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + struct hlist_head *head; + struct net_device *dev; + + head = &net->dev_index_head[h]; + idx = 0; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TUNNEL_INFO_GET); + if (!ehdr) { + ret = -EMSGSIZE; + goto out; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TUNNEL_INFO_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + goto out; + } + + ctx->req_info.dev = dev; + ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb); + ctx->req_info.dev = NULL; + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + if (ret == -EOPNOTSUPP) + goto cont; + goto out; + } + genlmsg_end(skb, ehdr); +cont: + idx++; + } + } +out: + rtnl_unlock(); + + ctx->pos_hash = h; + ctx->pos_idx = idx; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + + if (ret == -EMSGSIZE && skb->len) + return skb->len; + return ret; +} 
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 056cfe0b770e..f0dbd9905a53 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. +#include #include #include #include @@ -72,6 +73,12 @@ udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) return entry->use_cnt == 0 && !entry->flags; } +static bool +udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); +} + static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { @@ -564,12 +571,74 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) __udp_tunnel_nic_device_sync(dev, utn); } +static size_t +__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int j; + size_t size; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + size = 0; + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + size += nla_total_size(0) + /* _TABLE_ENTRY */ + nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ + nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ + } + + return size; +} + +static int +__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, + struct sk_buff *skb) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + struct nlattr *nest; + unsigned int j; + + utn = dev->udp_tunnel_nic; + if (!utn) + return 0; + + for (j = 0; j < info->tables[table].n_entries; j++) { + if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) + continue; + + nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + + if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + 
utn->entries[table][j].port) || + nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, + ilog2(utn->entries[table][j].type))) + goto err_cancel; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, + .dump_size = __udp_tunnel_nic_dump_size, + .dump_write = __udp_tunnel_nic_dump_write, }; static void -- cgit v1.2.3 From 8aa5a33578e9685d06020bd10d1637557423e945 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:33 +0000 Subject: xsk: Add new statistics It can be useful for the user to know the reason behind a dropped packet. Introduce new counters which track drops on the receive path caused by: 1. rx ring being full 2. fill ring being empty Also, on the tx path introduce a counter which tracks the number of times we attempt pull from the tx ring when it is empty. Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-2-ciara.loftus@intel.com --- include/net/xdp_sock.h | 4 ++++ include/uapi/linux/if_xdp.h | 5 ++++- net/xdp/xsk.c | 36 +++++++++++++++++++++++++++++++----- net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 6 ++++++ tools/include/uapi/linux/if_xdp.h | 5 ++++- 6 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 96bfc5f5f24e..c9d87cc40c11 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -69,7 +69,11 @@ struct xdp_sock { spinlock_t tx_completion_lock; /* Protects generic receive. 
*/ spinlock_t rx_lock; + + /* Statistics */ u64 rx_dropped; + u64 rx_queue_full; + struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3700266229f6..26e3bba8c204 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -123,7 +123,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) addr = xp_get_handle(xskb); err = xskq_prod_reserve_desc(xs->rx, addr, len); if (err) { - xs->rx_dropped++; + xs->rx_queue_full++; return err; } @@ -274,8 +274,10 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { - if (!xskq_cons_peek_desc(xs->tx, desc, umem)) + if (!xskq_cons_peek_desc(xs->tx, desc, umem)) { + xs->tx->queue_empty_descs++; continue; + } /* This is the backpressure mechanism for the Tx path. 
* Reserve space in the completion queue and only proceed @@ -387,6 +389,8 @@ static int xsk_generic_xmit(struct sock *sk) sent_frame = true; } + xs->tx->queue_empty_descs++; + out: if (sent_frame) sk->sk_write_space(sk); @@ -812,6 +816,12 @@ static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) ring->desc = offsetof(struct xdp_umem_ring, desc); } +struct xdp_statistics_v1 { + __u64 rx_dropped; + __u64 rx_invalid_descs; + __u64 tx_invalid_descs; +}; + static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -831,19 +841,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, case XDP_STATISTICS: { struct xdp_statistics stats; + bool extra_stats = true; + size_t stats_size; - if (len < sizeof(stats)) + if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; + } else if (len < sizeof(stats)) { + extra_stats = false; + stats_size = sizeof(struct xdp_statistics_v1); + } else { + stats_size = sizeof(stats); + } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; + if (extra_stats) { + stats.rx_ring_full = xs->rx_queue_full; + stats.rx_fill_ring_empty_descs = + xs->umem ? 
xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); + } else { + stats.rx_dropped += xs->rx_queue_full; + } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); - if (copy_to_user(optval, &stats, sizeof(stats))) + if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; - if (put_user(sizeof(stats), optlen)) + if (put_user(stats_size, optlen)) return -EFAULT; return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..89cf3551d3e9 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -235,6 +235,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + pool->fq->queue_empty_descs++; xp_release(xskb); return NULL; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5b5d24d2dd37..bf42cfd74b89 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -38,6 +38,7 @@ struct xsk_queue { u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; + u64 queue_empty_descs; }; /* The structure of the shared state of the rings are the same as the @@ -354,6 +355,11 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } +static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) +{ + return q ? 
q->queue_empty_descs : 0; +} + struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index be328c59389d..a78a8096f4ce 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -73,9 +73,12 @@ struct xdp_umem_reg { }; struct xdp_statistics { - __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_dropped; /* Dropped for other reasons */ __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ }; struct xdp_options { -- cgit v1.2.3 From 0d80cb4612aa32dc0faa17fa3ab6f96f33e2b4a7 Mon Sep 17 00:00:00 2001 From: Ciara Loftus Date: Wed, 8 Jul 2020 07:28:35 +0000 Subject: xsk: Add xdp statistics to xsk_diag Add xdp statistics to the information dumped through the xsk_diag interface Signed-off-by: Ciara Loftus Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200708072835.4427-4-ciara.loftus@intel.com --- include/uapi/linux/xdp_diag.h | 11 +++++++++++ net/xdp/xsk_diag.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h index 78b2591a7782..66b9973b4f4c 100644 --- a/include/uapi/linux/xdp_diag.h +++ b/include/uapi/linux/xdp_diag.h @@ -30,6 +30,7 @@ struct xdp_diag_msg { #define XDP_SHOW_RING_CFG (1 << 1) #define XDP_SHOW_UMEM (1 << 2) #define XDP_SHOW_MEMINFO (1 << 3) +#define XDP_SHOW_STATS (1 << 4) enum { XDP_DIAG_NONE, @@ -41,6 +42,7 @@ enum { XDP_DIAG_UMEM_FILL_RING, XDP_DIAG_UMEM_COMPLETION_RING, XDP_DIAG_MEMINFO, + XDP_DIAG_STATS, __XDP_DIAG_MAX, }; @@ -69,4 +71,13 @@ struct 
xdp_diag_umem { __u32 refs; }; +struct xdp_diag_stats { + __u64 n_rx_dropped; + __u64 n_rx_invalid; + __u64 n_rx_full; + __u64 n_fill_ring_empty; + __u64 n_tx_invalid; + __u64 n_tx_ring_empty; +}; + #endif /* _LINUX_XDP_DIAG_H */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 0163b26aaf63..21e9c2d123ee 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -76,6 +76,19 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) return err; } +static int xsk_diag_put_stats(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_stats du = {}; + + du.n_rx_dropped = xs->rx_dropped; + du.n_rx_invalid = xskq_nb_invalid_descs(xs->rx); + du.n_rx_full = xs->rx_queue_full; + du.n_fill_ring_empty = xs->umem ? xskq_nb_queue_empty_descs(xs->umem->fq) : 0; + du.n_tx_invalid = xskq_nb_invalid_descs(xs->tx); + du.n_tx_ring_empty = xskq_nb_queue_empty_descs(xs->tx); + return nla_put(nlskb, XDP_DIAG_STATS, sizeof(du), &du); +} + static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, struct xdp_diag_req *req, struct user_namespace *user_ns, @@ -118,6 +131,10 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->xdiag_show & XDP_SHOW_STATS) && + xsk_diag_put_stats(xs, nlskb)) + goto out_nlmsg_trim; + mutex_unlock(&xs->mutex); nlmsg_end(nlskb, nlh); return 0; -- cgit v1.2.3 From ed757328c34015be4ec51861a90bb3bcc807ad58 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 13 Jul 2020 12:24:18 +0200 Subject: atm: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. 
Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: David S. Miller --- drivers/atm/solos-pci.c | 2 +- include/uapi/linux/atmioc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c index c32f7dd9879a..b7646ae55942 100644 --- a/drivers/atm/solos-pci.c +++ b/drivers/atm/solos-pci.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Driver for the Solos PCI ADSL2+ card, designed to support Linux by - * Traverse Technologies -- http://www.traverse.com.au/ + * Traverse Technologies -- https://www.traverse.com.au/ * Xrio Limited -- http://www.xrio.com/ * * Copyright © 2008 Traverse Technologies diff --git a/include/uapi/linux/atmioc.h b/include/uapi/linux/atmioc.h index cd7655e40c77..a9030bcc8d56 100644 --- a/include/uapi/linux/atmioc.h +++ b/include/uapi/linux/atmioc.h @@ -5,7 +5,7 @@ /* - * See http://icawww1.epfl.ch/linux-atm/magic.html for the complete list of + * See https://icawww1.epfl.ch/linux-atm/magic.html for the complete list of * "magic" ioctl numbers. */ -- cgit v1.2.3 From 2801758391ba6b0c20e253b956355e1b15ad85a2 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:48 +0200 Subject: bridge: uapi: mrp: Extend MRP attributes for MRP interconnect Extend the existing MRP netlink attributes to allow to configure MRP Interconnect: IFLA_BRIDGE_MRP_IN_ROLE - the parameter type is br_mrp_in_role which contains the interconnect id, the ring id, the interconnect role(MIM or MIC) and the port ifindex that represents the interconnect port. 
IFLA_BRIDGE_MRP_IN_STATE - the parameter type is br_mrp_in_state which contains the interconnect id and the interconnect state. IFLA_BRIDGE_MRP_IN_TEST - the parameter type is br_mrp_start_in_test which contains the interconnect id, the interval at which to send MRP_InTest frames, how many test frames can be missed before declaring the interconnect ring open and the period which represents for how long to send MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 53 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/mrp_bridge.h | 38 +++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c114c1c2bd53..d840a3e37a37 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -167,6 +167,9 @@ enum { IFLA_BRIDGE_MRP_RING_ROLE, IFLA_BRIDGE_MRP_START_TEST, IFLA_BRIDGE_MRP_INFO, + IFLA_BRIDGE_MRP_IN_ROLE, + IFLA_BRIDGE_MRP_IN_STATE, + IFLA_BRIDGE_MRP_START_IN_TEST, __IFLA_BRIDGE_MRP_MAX, }; @@ -245,6 +248,37 @@ enum { #define IFLA_BRIDGE_MRP_INFO_MAX (__IFLA_BRIDGE_MRP_INFO_MAX - 1) +enum { + IFLA_BRIDGE_MRP_IN_STATE_UNSPEC, + IFLA_BRIDGE_MRP_IN_STATE_IN_ID, + IFLA_BRIDGE_MRP_IN_STATE_STATE, + __IFLA_BRIDGE_MRP_IN_STATE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_STATE_MAX (__IFLA_BRIDGE_MRP_IN_STATE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC, + IFLA_BRIDGE_MRP_IN_ROLE_RING_ID, + IFLA_BRIDGE_MRP_IN_ROLE_IN_ID, + IFLA_BRIDGE_MRP_IN_ROLE_ROLE, + IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX, + __IFLA_BRIDGE_MRP_IN_ROLE_MAX, +}; + +#define IFLA_BRIDGE_MRP_IN_ROLE_MAX (__IFLA_BRIDGE_MRP_IN_ROLE_MAX - 1) + +enum { + IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC, + IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID, + IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS, + IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD, + 
__IFLA_BRIDGE_MRP_START_IN_TEST_MAX, +}; + +#define IFLA_BRIDGE_MRP_START_IN_TEST_MAX (__IFLA_BRIDGE_MRP_START_IN_TEST_MAX - 1) + struct br_mrp_instance { __u32 ring_id; __u32 p_ifindex; @@ -270,6 +304,25 @@ struct br_mrp_start_test { __u32 monitor; }; +struct br_mrp_in_state { + __u32 in_state; + __u16 in_id; +}; + +struct br_mrp_in_role { + __u32 ring_id; + __u32 in_role; + __u32 i_ifindex; + __u16 in_id; +}; + +struct br_mrp_start_in_test { + __u32 interval; + __u32 max_miss; + __u32 period; + __u16 in_id; +}; + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h index bee366540212..6aeb13ef0b1e 100644 --- a/include/uapi/linux/mrp_bridge.h +++ b/include/uapi/linux/mrp_bridge.h @@ -21,11 +21,22 @@ enum br_mrp_ring_role_type { BR_MRP_RING_ROLE_MRA, }; +enum br_mrp_in_role_type { + BR_MRP_IN_ROLE_DISABLED, + BR_MRP_IN_ROLE_MIC, + BR_MRP_IN_ROLE_MIM, +}; + enum br_mrp_ring_state_type { BR_MRP_RING_STATE_OPEN, BR_MRP_RING_STATE_CLOSED, }; +enum br_mrp_in_state_type { + BR_MRP_IN_STATE_OPEN, + BR_MRP_IN_STATE_CLOSED, +}; + enum br_mrp_port_state_type { BR_MRP_PORT_STATE_DISABLED, BR_MRP_PORT_STATE_BLOCKED, @@ -36,6 +47,7 @@ enum br_mrp_port_state_type { enum br_mrp_port_role_type { BR_MRP_PORT_ROLE_PRIMARY, BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_INTER, }; enum br_mrp_tlv_header_type { @@ -45,6 +57,10 @@ enum br_mrp_tlv_header_type { BR_MRP_TLV_HEADER_RING_TOPO = 0x3, BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, + BR_MRP_TLV_HEADER_IN_TEST = 0x6, + BR_MRP_TLV_HEADER_IN_TOPO = 0x7, + BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8, + BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9, BR_MRP_TLV_HEADER_OPTION = 0x7f, }; @@ -118,4 +134,26 @@ struct br_mrp_oui_hdr { __u8 oui[MRP_OUI_LENGTH]; }; +struct br_mrp_in_test_hdr { + __be16 id; + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 state; + __be16 transitions; + __be32 timestamp; +}; + +struct 
br_mrp_in_topo_hdr { + __u8 sa[ETH_ALEN]; + __be16 id; + __be16 interval; +}; + +struct br_mrp_in_link_hdr { + __u8 sa[ETH_ALEN]; + __be16 port_role; + __be16 id; + __be16 interval; +}; + #endif -- cgit v1.2.3 From 559139cb0405d38816e5e725adee9000db993235 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:56 +0200 Subject: bridge: uapi: mrp: Extend MRP_INFO attributes for interconnect status Extend the existing MRP_INFO to return status of MRP interconnect. In case there is no MRP interconnect on the node then the role will be disabled so the other attributes can be ignored. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d840a3e37a37..c1227aecd38f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -243,6 +243,11 @@ enum { IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR, + IFLA_BRIDGE_MRP_INFO_I_IFINDEX, + IFLA_BRIDGE_MRP_INFO_IN_STATE, + IFLA_BRIDGE_MRP_INFO_IN_ROLE, + IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL, + IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS, __IFLA_BRIDGE_MRP_INFO_MAX, }; -- cgit v1.2.3 From ffb3adba64801f70c472303c9e386eb5eaec193d Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 14 Jul 2020 09:34:58 +0200 Subject: net: bridge: Add port attribute IFLA_BRPORT_MRP_IN_OPEN This patch adds a new port attribute, IFLA_BRPORT_MRP_IN_OPEN, which allows to notify the userspace when the node lost the continuity of MRP_InTest frames. Signed-off-by: Horatiu Vultur Acked-by: Nikolay Aleksandrov Signed-off-by: David S.
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 3 +++ tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc185a007ade..26842ffd0501 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c532fa65c983..147d52596e17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -152,6 +152,7 @@ static inline size_t br_port_info_size(void) #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + 0; } @@ -216,6 +217,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_NEIGH_SUPPRESS)) || nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & BR_MRP_LOST_CONT)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, + !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cafedbbfefbe..781e482dc499 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -344,6 +344,7 @@ enum { IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From c201324b54553aebb32845193680f21eb493c6e5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:42:42 -0700 Subject: net: caif: drop duplicate words in comments Drop doubled words "or" and "the" in several comments. 
Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Jakub Kicinski --- include/net/caif/caif_layer.h | 4 ++-- include/uapi/linux/caif/caif_socket.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h index 064094101cb5..51f7bb42a936 100644 --- a/include/net/caif/caif_layer.h +++ b/include/net/caif/caif_layer.h @@ -156,7 +156,7 @@ struct cflayer { * CAIF packets upwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * called receive function. This means that the the + * called receive function. This means that the * packet cannot be accessed after passing it to the * above layer using up->receive(). * @@ -184,7 +184,7 @@ struct cflayer { * CAIF packet downwards in the stack. * Packet handling rules: * - The CAIF packet (cfpkt) ownership is passed to the - * transmit function. This means that the the packet + * transmit function. This means that the packet * cannot be accessed after passing it to the below * layer using dn->transmit(). * diff --git a/include/uapi/linux/caif/caif_socket.h b/include/uapi/linux/caif/caif_socket.h index 10ec1d1cf68e..d9970bbaa156 100644 --- a/include/uapi/linux/caif/caif_socket.h +++ b/include/uapi/linux/caif/caif_socket.h @@ -169,7 +169,7 @@ struct sockaddr_caif { * @CAIFSO_LINK_SELECT: Selector used if multiple CAIF Link layers are * available. Either a high bandwidth * link can be selected (CAIF_LINK_HIGH_BANDW) or - * or a low latency link (CAIF_LINK_LOW_LATENCY). + * a low latency link (CAIF_LINK_LOW_LATENCY). * This option is of type __u32. * Alternatively SO_BINDTODEVICE can be used. 
* -- cgit v1.2.3 From 644bfe51fa49c22244d24e896cd3fe3ee2f2cfd1 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:37 +0200 Subject: cpumap: Formalize map value as a named struct As it has been already done for devmap, introduce 'struct bpf_cpumap_val' to formalize the expected values that can be passed in for a CPUMAP. Update cpumap code to use the struct. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/754f950674665dae6139c061d28c1d982aaf4170.1594734381.git.lorenzo@kernel.org --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/cpumap.c | 28 +++++++++++++++------------- tools/include/uapi/linux/bpf.h | 9 +++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 323c91c4fab0..ff48dc00e8d0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -52,7 +52,6 @@ struct xdp_bulk_queue { struct bpf_cpu_map_entry { u32 cpu; /* kthread CPU and map index */ int map_id; /* Back reference to map */ - u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; @@ -62,10 +61,13 @@ struct bpf_cpu_map_entry { /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; - struct work_struct kthread_stop_wq; + + struct bpf_cpumap_val value; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; + + struct work_struct kthread_stop_wq; }; struct bpf_cpu_map { @@ -307,8 +309,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, - int map_id) +static struct bpf_cpu_map_entry * +__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -338,13 +340,13 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->queue) goto free_bulkq; - err = ptr_ring_init(rcpu->queue, qsize, gfp); + err = ptr_ring_init(rcpu->queue, value->qsize, gfp); if (err) goto free_queue; rcpu->cpu = cpu; rcpu->map_id = map_id; - rcpu->qsize = qsize; + rcpu->value.qsize = value->qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -437,12 +439,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpumap_val 
cpumap_value = {}; struct bpf_cpu_map_entry *rcpu; - /* Array index key correspond to CPU number */ u32 key_cpu = *(u32 *)key; - /* Value is the queue size */ - u32 qsize = *(u32 *)value; + + memcpy(&cpumap_value, value, map->value_size); if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -450,18 +452,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; - if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */ return -EOVERFLOW; /* Make sure CPU is a valid possible cpu */ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu)) return -ENODEV; - if (qsize == 0) { + if (cpumap_value.qsize == 0) { rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; @@ -523,7 +525,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_cpu_map_entry *rcpu = __cpu_map_lookup_elem(map, *(u32 *)key); - return rcpu ? &rcpu->qsize : NULL; + return rcpu ? &rcpu->value : NULL; } static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e386389913a..109623527358 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3849,6 +3849,15 @@ struct bpf_devmap_val { } bpf_prog; }; +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ +}; + enum sk_action { SK_DROP = 0, SK_PASS, -- cgit v1.2.3 From 9216477449f33cdbc9c9a99d49f500b7fbb81702 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 14 Jul 2020 15:56:38 +0200 Subject: bpf: cpumap: Add the possibility to attach an eBPF program to cpumap Introduce the capability to attach an eBPF program to cpumap entries. The idea behind this feature is to add the possibility to define on which CPU run the eBPF program if the underlying hw does not support RSS. Current supported verdicts are XDP_DROP and XDP_PASS. This patch has been tested on Marvell ESPRESSObin using xdp_redirect_cpu sample available in the kernel tree to identify possible performance regressions. Results show there are no observable differences in packet-per-second: $./xdp_redirect_cpu --progname xdp_cpu_map0 --dev eth0 --cpu 1 rx: 354.8 Kpps rx: 356.0 Kpps rx: 356.8 Kpps rx: 356.3 Kpps rx: 356.6 Kpps rx: 356.6 Kpps rx: 356.7 Kpps rx: 355.8 Kpps rx: 356.8 Kpps rx: 356.8 Kpps Co-developed-by: Jesper Dangaard Brouer Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/5c9febdf903d810b3415732e5cd98491d7d9067a.1594734381.git.lorenzo@kernel.org --- include/linux/bpf.h | 6 ++ include/net/xdp.h | 5 ++ include/trace/events/xdp.h | 14 +++-- include/uapi/linux/bpf.h | 5 ++ kernel/bpf/cpumap.c | 121 ++++++++++++++++++++++++++++++++++++----- net/core/dev.c | 9 +++ tools/include/uapi/linux/bpf.h | 5 ++ 7 files changed, 148 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c67c88ad35f8..54ad426dbea1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1272,6 +1272,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry 
*rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); +bool cpu_map_prog_allowed(struct bpf_map *map); /* Return map's numa specified by userspace */ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) @@ -1432,6 +1433,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, return 0; } +static inline bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return false; +} + static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/net/xdp.h b/include/net/xdp.h index 5b383c450858..83b9e0142b52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -98,6 +98,11 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +struct xdp_cpumap_stats { + unsigned int pass; + unsigned int drop; +}; + /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b73d3e141323..e2c99f5bee39 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, - int sched), + int sched, struct xdp_cpumap_stats *xdp_stats), - TP_ARGS(map_id, processed, drops, sched), + TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) @@ -188,6 +188,8 @@ TRACE_EVENT(xdp_cpumap_kthread, __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) + __field(unsigned int, xdp_pass) + __field(unsigned int, xdp_drop) ), TP_fast_assign( @@ -197,16 +199,20 @@ TRACE_EVENT(xdp_cpumap_kthread, __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; + __entry->xdp_pass = xdp_stats->pass; + __entry->xdp_drop = xdp_stats->drop; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" - " sched=%d", + " 
sched=%d" + " xdp_pass=%u xdp_drop=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, - __entry->sched) + __entry->sched, + __entry->xdp_pass, __entry->xdp_drop) ); TRACE_EVENT(xdp_cpumap_enqueue, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ff48dc00e8d0..b3a8aea81ee5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -63,6 +63,7 @@ struct bpf_cpu_map_entry { struct task_struct *kthread; struct bpf_cpumap_val value; + struct bpf_prog *prog; atomic_t refcnt; /* Control when this struct can be free'ed */ struct rcu_head rcu; @@ -82,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { + u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; u64 cost; @@ -92,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + (value_size != offsetofend(struct bpf_cpumap_val, qsize) && + value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) || + attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); cmap = kzalloc(sizeof(*cmap), GFP_USER); @@ -214,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) static void 
put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { if (atomic_dec_and_test(&rcpu->refcnt)) { + if (rcpu->prog) + bpf_prog_put(rcpu->prog); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); @@ -222,6 +228,62 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, + void **frames, int n, + struct xdp_cpumap_stats *stats) +{ + struct xdp_rxq_info rxq; + struct xdp_buff xdp; + int i, nframes = 0; + + if (!rcpu->prog) + return n; + + rcu_read_lock(); + + xdp_set_return_frame_no_direct(); + xdp.rxq = &rxq; + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + u32 act; + int err; + + rxq.dev = xdpf->dev_rx; + rxq.mem = xdpf->mem; + /* TODO: report queue_index to xdp_rxq_info */ + + xdp_convert_frame_to_buff(xdpf, &xdp); + + act = bpf_prog_run_xdp(rcpu->prog, &xdp); + switch (act) { + case XDP_PASS: + err = xdp_update_frame_from_buff(&xdp, xdpf); + if (err < 0) { + xdp_return_frame(xdpf); + stats->drop++; + } else { + frames[nframes++] = xdpf; + stats->pass++; + } + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fallthrough */ + case XDP_DROP: + xdp_return_frame(xdpf); + stats->drop++; + break; + } + } + + xdp_clear_return_frame_no_direct(); + + rcu_read_unlock(); + + return nframes; +} + #define CPUMAP_BATCH 8 static int cpu_map_kthread_run(void *data) @@ -236,11 +298,12 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. 
*/ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_cpumap_stats stats = {}; /* zero stats */ + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m; + int i, n, m, nframes; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -261,8 +324,8 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - + n = __ptr_ring_consume_batched(rcpu->queue, frames, + CPUMAP_BATCH); for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page = virt_to_page(f); @@ -274,15 +337,19 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } - m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < n; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - drops = n; + /* Support running another XDP prog on this CPU */ + nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats); + if (nframes) { + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < nframes; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops += nframes; + } } local_bh_disable(); - for (i = 0; i < n; i++) { + for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; int ret; @@ -299,7 +366,7 @@ static int cpu_map_kthread_run(void *data) drops++; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -309,13 +376,38 @@ static int cpu_map_kthread_run(void *data) return 0; } +bool cpu_map_prog_allowed(struct bpf_map *map) +{ + return map->map_type 
== BPF_MAP_TYPE_CPUMAP && + map->value_size != offsetofend(struct bpf_cpumap_val, qsize); +} + +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +{ + struct bpf_prog *prog; + + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + bpf_prog_put(prog); + return -EINVAL; + } + + rcpu->value.bpf_prog.id = prog->aux->id; + rcpu->prog = prog; + + return 0; +} + static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) { + int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; - int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -357,6 +449,9 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + goto free_ptr_ring; + /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); diff --git a/net/core/dev.c b/net/core/dev.c index b61075828358..b820527f0a8d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5448,6 +5448,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) for (i = 0; i < new->aux->used_map_cnt; i++) { if (dev_map_can_have_prog(new->aux->used_maps[i])) return -EINVAL; + if (cpu_map_prog_allowed(new->aux->used_maps[i])) + return -EINVAL; } } @@ -8875,6 +8877,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, + "BPF_XDP_CPUMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 
for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 109623527358..c010b57fce3f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -227,6 +227,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETSOCKNAME, BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, __MAX_BPF_ATTACH_TYPE }; @@ -3856,6 +3857,10 @@ struct bpf_devmap_val { */ struct bpf_cpumap_val { __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; }; enum sk_action { -- cgit v1.2.3 From bfdfa51702dec67e9fcd52568b4cf3c7f799db8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 18:29:11 -0700 Subject: bpf: Drop duplicated words in uapi helper comments Drop doubled words "will" and "attach". Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6b9f71ae-4f8e-0259-2c5d-187ddaefe6eb@infradead.org --- include/uapi/linux/bpf.h | 6 +++--- tools/include/uapi/linux/bpf.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. 
* * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c010b57fce3f..7ac3992dacfe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2420,7 +2420,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -2457,7 +2457,7 @@ union bpf_attr { * Look for an IPv6 socket. * * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will + * socket lookup table in the netns associated with the *ctx* * will be used. For the TC hooks, this is the netns of the device * in the skb. For socket hooks, this is the netns of the socket. * If *netns* is any other signed 32-bit value greater than or @@ -4000,7 +4000,7 @@ struct bpf_link_info { /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. 
to bind to, depends on - * attach attach type). + * attach type). */ struct bpf_sock_addr { __u32 user_family; /* Allows 4-byte read, but no write. */ -- cgit v1.2.3 From e3a5a1e8b6548f5d37328e2d3571edc5c9e6d7c0 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Thu, 16 Jul 2020 12:12:35 -0700 Subject: tcp: add SNMP counter for no. of duplicate segments reported by DSACK There are two existing SNMP counters, TCPDSACKRecv and TCPDSACKOfoRecv, which are incremented depending on whether the DSACKed range is below the cumulative ACK sequence number or not. Unfortunately, these both implicitly assume each DSACK covers only one segment. This makes these counters unusable for estimating spurious retransmit rates, or real/non-spurious loss rate. This patch introduces a new SNMP counter, TCPDSACKRecvSegs, which tracks the estimated number of duplicate segments based on: (DSACKed sequence range) / MSS. This counter is usable for estimating spurious retransmit rates, or real/non-spurious loss rate. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 7d91f4debc48..cee9f8e6fce3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -287,6 +287,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEALTKEY, /* TCPFastOpenPassiveAltKey */ LINUX_MIB_TCPTIMEOUTREHASH, /* TCPTimeoutRehash */ LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ + LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 75545a829a2b..1074df726ec0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), + SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5d6bbcb1e570..82906deb7874 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1153,6 +1153,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && -- cgit v1.2.3 From e9ddbb7707ff5891616240026062b8c1e29864ca Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 17 Jul 2020 12:35:23 +0200 Subject: bpf: Introduce SK_LOOKUP program type with a dedicated attach point Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. 
The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection-oriented protocols, or when looking up an unconnected socket for a packet for connection-less protocols. When called, an SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what the bind() API can express. Two use-cases driving this work are: (1) steer packets destined to an IP range, on fixed port to a socket 192.0.2.0/24, port 80 -> NGINX socket (2) steer packets destined to an IP address, on any port to a socket 198.51.100.1, any port -> L7 proxy socket In its run-time context, the program receives information about the packet that triggered the socket lookup, namely the IP version, L4 protocol identifier, and address 4-tuple. The context can be further extended to include an ingress interface identifier. To select a socket, the BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls the bpf_sk_assign(ctx, sk, ...) helper to record the selection. The transport layer then uses the selected socket as the result of the socket lookup. In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, the transport layer should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail. This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on the local delivery path in the ipv4 and ipv6 stacks.
Suggested-by: Marek Majkowski Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 3 + include/linux/bpf.h | 1 + include/linux/bpf_types.h | 2 + include/linux/filter.h | 17 +++++ include/uapi/linux/bpf.h | 77 +++++++++++++++++++ kernel/bpf/net_namespace.c | 5 ++ kernel/bpf/syscall.c | 9 +++ kernel/bpf/verifier.c | 13 +++- net/core/filter.c | 180 +++++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 9 ++- 10 files changed, 312 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 47d5b0c708c9..722f799c1a2e 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -8,6 +8,7 @@ enum netns_bpf_attach_type { NETNS_BPF_INVALID = -1, NETNS_BPF_FLOW_DISSECTOR = 0, + NETNS_BPF_SK_LOOKUP, MAX_NETNS_BPF_ATTACH_TYPE }; @@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type) switch (attach_type) { case BPF_FLOW_DISSECTOR: return NETNS_BPF_FLOW_DISSECTOR; + case BPF_SK_LOOKUP: + return NETNS_BPF_SK_LOOKUP; default: return NETNS_BPF_INVALID; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c8c9eabcd106..adb16bdc5f0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -249,6 +249,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a18ae82a298a..a52a5688418e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -64,6 +64,8 @@ 
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, struct sk_reuseport_md, struct sk_reuseport_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup, + struct bpf_sk_lookup, struct bpf_sk_lookup_kern) #endif #if defined(CONFIG_BPF_JIT) BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, diff --git a/include/linux/filter.h b/include/linux/filter.h index 0b0144752d78..fa1ea12ad2cd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1278,4 +1278,21 @@ struct bpf_sockopt_kern { s32 retval; }; +struct bpf_sk_lookup_kern { + u16 family; + u16 protocol; + struct { + __be32 saddr; + __be32 daddr; + } v4; + struct { + const struct in6_addr *saddr; + const struct in6_addr *daddr; + } v6; + __be16 sport; + u16 dport; + struct sock *selected_sk; + bool no_reuseport; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7ac3992dacfe..54d0c886e3ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -189,6 +189,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, }; enum bpf_attach_type { @@ -228,6 +229,7 @@ enum bpf_attach_type { BPF_XDP_DEVMAP, BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, __MAX_BPF_ATTACH_TYPE }; @@ -3069,6 +3071,10 @@ union bpf_attr { * * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * * Assign the *sk* to the *skb*. When combined with appropriate * routing configuration to receive the packet towards the socket, * will cause *skb* to be delivered to the specified socket. @@ -3094,6 +3100,56 @@ union bpf_attr { * **-ESOCKTNOSUPPORT** if the socket type is not supported * (reuseport). 
* + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can be a combination of the following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * * u64 bpf_ktime_get_boot_ns(void) * Description * Return the time elapsed since system boot, in nanoseconds.
@@ -3607,6 +3663,12 @@ enum { BPF_RINGBUF_HDR_SZ = 8, }; +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, @@ -4349,4 +4411,19 @@ struct bpf_pidns_info { __u32 pid; __u32 tgid; }; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __u32 remote_port; /* Network byte order */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index e9c8e26ac8f2..38b368bccda2 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -373,6 +373,8 @@ static int netns_bpf_max_progs(enum netns_bpf_attach_type type) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: return 1; + case NETNS_BPF_SK_LOOKUP: + return 64; default: return 0; } @@ -403,6 +405,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, case NETNS_BPF_FLOW_DISSECTOR: err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; + case NETNS_BPF_SK_LOOKUP: + err = 0; /* nothing to check */ + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ea9dfbebd8c..d07417d17712 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_SK_LOOKUP: + if (expected_attach_type == BPF_SK_LOOKUP) + return 0; 
+ return -EINVAL; case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; @@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!capable(CAP_NET_ADMIN)) @@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: return BPF_PROG_TYPE_TRACING; + case BPF_SK_LOOKUP: + return BPF_PROG_TYPE_SK_LOOKUP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: + case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; @@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr) ret = tracing_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c1efc9d08fd..9a6703bc3f36 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } meta->ref_obj_id = reg->ref_obj_id; } - } else if (arg_type == ARG_PTR_TO_SOCKET) { + } else if (arg_type == ARG_PTR_TO_SOCKET || + arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { expected_type = PTR_TO_SOCKET; - if (type != expected_type) - goto err_type; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { + if (type != expected_type) + goto err_type; + } } else if (arg_type == ARG_PTR_TO_BTF_ID) { expected_type = PTR_TO_BTF_ID; if (type != expected_type) @@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env) 
return -ENOTSUPP; } break; + case BPF_PROG_TYPE_SK_LOOKUP: + range = tnum_range(SK_DROP, SK_PASS); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. diff --git a/net/core/filter.c b/net/core/filter.c index bdd2382e655d..d099436b3ff5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9229,6 +9229,186 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = { const struct bpf_prog_ops sk_reuseport_prog_ops = { }; + +BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, + struct sock *, sk, u64, flags) +{ + if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | + BPF_SK_LOOKUP_F_NO_REUSEPORT))) + return -EINVAL; + if (unlikely(sk && sk_is_refcounted(sk))) + return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ + if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED)) + return -ESOCKTNOSUPPORT; /* reject connected sockets */ + + /* Check if socket is suitable for packet L3/L4 protocol */ + if (sk && sk->sk_protocol != ctx->protocol) + return -EPROTOTYPE; + if (sk && sk->sk_family != ctx->family && + (sk->sk_family == AF_INET || ipv6_only_sock(sk))) + return -EAFNOSUPPORT; + + if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) + return -EEXIST; + + /* Select socket as lookup result */ + ctx->selected_sk = sk; + ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; + return 0; +} + +static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { + .func = bpf_sk_lookup_assign, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_sk_assign: + return &bpf_sk_lookup_assign_proto; + case BPF_FUNC_sk_release: + 
return &bpf_sk_release_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool sk_lookup_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) + return false; + if (off % size != 0) + return false; + if (type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sk_lookup, sk): + info->reg_type = PTR_TO_SOCKET_OR_NULL; + return size == sizeof(__u64); + + case bpf_ctx_range(struct bpf_sk_lookup, family): + case bpf_ctx_range(struct bpf_sk_lookup, protocol): + case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): + case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): + case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case bpf_ctx_range(struct bpf_sk_lookup, local_port): + bpf_ctx_record_field_size(info, sizeof(__u32)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + + default: + return false; + } +} + +static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sk_lookup, sk): + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, selected_sk)); + break; + + case offsetof(struct bpf_sk_lookup, family): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + family, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, protocol): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + protocol, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, 
remote_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.saddr, 4, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_ip4): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + v4.daddr, 4, target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sk_lookup, + remote_ip6[0], remote_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.saddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case bpf_ctx_range_till(struct bpf_sk_lookup, + local_ip6[0], local_ip6[3]): { +#if IS_ENABLED(CONFIG_IPV6) + int off = si->off; + + off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); + off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, + offsetof(struct bpf_sk_lookup_kern, v6.daddr)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + } + case offsetof(struct bpf_sk_lookup, remote_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + sport, 2, target_size)); + break; + + case offsetof(struct bpf_sk_lookup, local_port): + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + dport, 2, target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_prog_ops sk_lookup_prog_ops = { +}; + +const struct bpf_verifier_ops 
sk_lookup_verifier_ops = { + .get_func_proto = sk_lookup_func_proto, + .is_valid_access = sk_lookup_is_valid_access, + .convert_ctx_access = sk_lookup_convert_ctx_access, +}; + #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6843376733df..5bfa448b4704 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -404,6 +404,7 @@ class PrinterHelpers(Printer): type_fwds = [ 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', @@ -450,6 +451,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', @@ -487,6 +489,11 @@ class PrinterHelpers(Printer): 'struct sk_msg_buff': 'struct sk_msg_md', 'struct xdp_buff': 'struct xdp_md', } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] def print_header(self): header = '''\ @@ -543,7 +550,7 @@ class PrinterHelpers(Printer): for i, a in enumerate(proto['args']): t = a['type'] n = a['name'] - if proto['name'] == 'bpf_get_socket_cookie' and i == 0: + if proto['name'] in self.overloaded_helpers and i == 0: t = 'void' n = 'ctx' one_arg = '{}{}'.format(comma, self.map_type(t)) -- cgit v1.2.3 From c4471ad9a50d5548e66ae4511acfb1dc23a48744 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 20 Jul 2020 00:03:33 +0200 Subject: net: phy: add USXGMII link partner ability constants The constants are taken from the USXGMII Singleport Copper Interface specification. The naming are based on the SGMII ones, but with an MDIO_ prefix. Signed-off-by: Michael Walle Reviewed-by: Russell King Signed-off-by: David S. 
Miller --- include/uapi/linux/mdio.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 4bcb41c71b8c..3f302e2523b2 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -324,4 +324,30 @@ static inline __u16 mdio_phy_id_c45(int prtad, int devad) return MDIO_PHY_ID_C45 | (prtad << 5) | devad; } +/* UsxgmiiChannelInfo[15:0] for USXGMII in-band auto-negotiation.*/ +#define MDIO_USXGMII_EEE_CLK_STP 0x0080 /* EEE clock stop supported */ +#define MDIO_USXGMII_EEE 0x0100 /* EEE supported */ +#define MDIO_USXGMII_SPD_MASK 0x0e00 /* USXGMII speed mask */ +#define MDIO_USXGMII_FULL_DUPLEX 0x1000 /* USXGMII full duplex */ +#define MDIO_USXGMII_DPX_SPD_MASK 0x1e00 /* USXGMII duplex and speed bits */ +#define MDIO_USXGMII_10 0x0000 /* 10Mbps */ +#define MDIO_USXGMII_10HALF 0x0000 /* 10Mbps half-duplex */ +#define MDIO_USXGMII_10FULL 0x1000 /* 10Mbps full-duplex */ +#define MDIO_USXGMII_100 0x0200 /* 100Mbps */ +#define MDIO_USXGMII_100HALF 0x0200 /* 100Mbps half-duplex */ +#define MDIO_USXGMII_100FULL 0x1200 /* 100Mbps full-duplex */ +#define MDIO_USXGMII_1000 0x0400 /* 1000Mbps */ +#define MDIO_USXGMII_1000HALF 0x0400 /* 1000Mbps half-duplex */ +#define MDIO_USXGMII_1000FULL 0x1400 /* 1000Mbps full-duplex */ +#define MDIO_USXGMII_10G 0x0600 /* 10Gbps */ +#define MDIO_USXGMII_10GHALF 0x0600 /* 10Gbps half-duplex */ +#define MDIO_USXGMII_10GFULL 0x1600 /* 10Gbps full-duplex */ +#define MDIO_USXGMII_2500 0x0800 /* 2500Mbps */ +#define MDIO_USXGMII_2500HALF 0x0800 /* 2500Mbps half-duplex */ +#define MDIO_USXGMII_2500FULL 0x1800 /* 2500Mbps full-duplex */ +#define MDIO_USXGMII_5000 0x0a00 /* 5000Mbps */ +#define MDIO_USXGMII_5000HALF 0x0a00 /* 5000Mbps half-duplex */ +#define MDIO_USXGMII_5000FULL 0x1a00 /* 5000Mbps full-duplex */ +#define MDIO_USXGMII_LINK 0x8000 /* PHY link with copper-side partner */ + #endif /* _UAPI__LINUX_MDIO_H__ */ -- cgit 
v1.2.3 From 55db9c0e853421fa71cac5e6855898601f78a1f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jul 2020 08:23:15 +0200 Subject: net: remove compat_sys_{get,set}sockopt Now that the ->compat_{get,set}sockopt proto_ops methods are gone there is no good reason left to keep the compat syscalls separate. This fixes the odd use of unsigned int for the compat_setsockopt optlen and the missing sock_use_custom_sol_socket. It would also easily allow running the eBPF hooks for the compat syscalls, but such a large change in behavior does not belong into a consolidation patch like this one. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- arch/arm64/include/asm/unistd32.h | 4 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 4 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 4 +- arch/parisc/kernel/syscalls/syscall.tbl | 4 +- arch/powerpc/kernel/syscalls/syscall.tbl | 4 +- arch/s390/kernel/syscalls/syscall.tbl | 4 +- arch/sparc/kernel/sys32.S | 12 ++-- arch/sparc/kernel/syscalls/syscall.tbl | 4 +- arch/x86/entry/syscall_x32.c | 7 ++ arch/x86/entry/syscalls/syscall_32.tbl | 4 +- arch/x86/entry/syscalls/syscall_64.tbl | 4 +- include/linux/compat.h | 4 -- include/linux/syscalls.h | 4 ++ include/uapi/asm-generic/unistd.h | 4 +- net/compat.c | 79 +--------------------- net/socket.c | 25 ++++--- tools/include/uapi/asm-generic/unistd.h | 4 +- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/s390/entry/syscalls/syscall.tbl | 4 +- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 4 +- 20 files changed, 62 insertions(+), 125 deletions(-) (limited to 'include/uapi') diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 6d95d0c8bf2f..166e36903110 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -599,9 +599,9 @@ __SYSCALL(__NR_recvfrom, compat_sys_recvfrom) #define __NR_shutdown 293 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_setsockopt 294 
-__SYSCALL(__NR_setsockopt, compat_sys_setsockopt) +__SYSCALL(__NR_setsockopt, sys_setsockopt) #define __NR_getsockopt 295 -__SYSCALL(__NR_getsockopt, compat_sys_getsockopt) +__SYSCALL(__NR_getsockopt, sys_getsockopt) #define __NR_sendmsg 296 __SYSCALL(__NR_sendmsg, compat_sys_sendmsg) #define __NR_recvmsg 297 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index f777141f5256..8488b0d0a99e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -60,8 +60,8 @@ 50 n32 getsockname sys_getsockname 51 n32 getpeername sys_getpeername 52 n32 socketpair sys_socketpair -53 n32 setsockopt compat_sys_setsockopt -54 n32 getsockopt compat_sys_getsockopt +53 n32 setsockopt sys_setsockopt +54 n32 getsockopt sys_getsockopt 55 n32 clone __sys_clone 56 n32 fork __sys_fork 57 n32 execve compat_sys_execve diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 13280625d312..b20522f813f9 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -184,7 +184,7 @@ 170 o32 connect sys_connect 171 o32 getpeername sys_getpeername 172 o32 getsockname sys_getsockname -173 o32 getsockopt sys_getsockopt compat_sys_getsockopt +173 o32 getsockopt sys_getsockopt sys_getsockopt 174 o32 listen sys_listen 175 o32 recv sys_recv compat_sys_recv 176 o32 recvfrom sys_recvfrom compat_sys_recvfrom @@ -192,7 +192,7 @@ 178 o32 send sys_send 179 o32 sendmsg sys_sendmsg compat_sys_sendmsg 180 o32 sendto sys_sendto -181 o32 setsockopt sys_setsockopt compat_sys_setsockopt +181 o32 setsockopt sys_setsockopt sys_setsockopt 182 o32 shutdown sys_shutdown 183 o32 socket sys_socket 184 o32 socketpair sys_socketpair diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5a758fa6ec52..3494e4fa1a17 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ 
b/arch/parisc/kernel/syscalls/syscall.tbl @@ -198,8 +198,8 @@ 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common chown sys_chown -181 common setsockopt sys_setsockopt compat_sys_setsockopt -182 common getsockopt sys_getsockopt compat_sys_getsockopt +181 common setsockopt sys_setsockopt sys_setsockopt +182 common getsockopt sys_getsockopt sys_getsockopt 183 common sendmsg sys_sendmsg compat_sys_sendmsg 184 common recvmsg sys_recvmsg compat_sys_recvmsg 185 common semop sys_semop diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index f833a3190822..94eb5b27ef65 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -433,8 +433,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index bfdcb7633957..0d63c71fc544 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname sys_getsockname 
368 common getpeername sys_getpeername sys_getpeername 369 common sendto sys_sendto sys_sendto diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 489ffab918a8..a45f0f31fe51 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -157,22 +157,22 @@ do_sys_shutdown: /* sys_shutdown(int, int) */ nop nop nop -do_sys_setsockopt: /* compat_sys_setsockopt(int, int, int, char *, int) */ +do_sys_setsockopt: /* sys_setsockopt(int, int, int, char *, int) */ 47: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_setsockopt), %g1 + sethi %hi(sys_setsockopt), %g1 48: ldswa [%o1 + 0x8] %asi, %o2 49: lduwa [%o1 + 0xc] %asi, %o3 50: ldswa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_setsockopt), %g0 + jmpl %g1 + %lo(sys_setsockopt), %g0 51: ldswa [%o1 + 0x4] %asi, %o1 nop -do_sys_getsockopt: /* compat_sys_getsockopt(int, int, int, u32, u32) */ +do_sys_getsockopt: /* sys_getsockopt(int, int, int, u32, u32) */ 52: ldswa [%o1 + 0x0] %asi, %o0 - sethi %hi(compat_sys_getsockopt), %g1 + sethi %hi(sys_getsockopt), %g1 53: ldswa [%o1 + 0x8] %asi, %o2 54: lduwa [%o1 + 0xc] %asi, %o3 55: lduwa [%o1 + 0x10] %asi, %o4 - jmpl %g1 + %lo(compat_sys_getsockopt), %g0 + jmpl %g1 + %lo(sys_getsockopt), %g0 56: ldswa [%o1 + 0x4] %asi, %o1 nop do_sys_sendmsg: /* compat_sys_sendmsg(int, struct compat_msghdr *, unsigned int) */ diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 8004a276cb74..c59b37965add 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -147,7 +147,7 @@ 115 32 getgroups32 sys_getgroups 116 common gettimeofday sys_gettimeofday compat_sys_gettimeofday 117 common getrusage sys_getrusage compat_sys_getrusage -118 common getsockopt sys_getsockopt compat_sys_getsockopt +118 common getsockopt sys_getsockopt sys_getsockopt 119 common getcwd sys_getcwd 120 common readv sys_readv compat_sys_readv 121 common writev sys_writev compat_sys_writev @@ -425,7 
+425,7 @@ 352 common userfaultfd sys_userfaultfd 353 common bind sys_bind 354 common listen sys_listen -355 common setsockopt sys_setsockopt compat_sys_setsockopt +355 common setsockopt sys_setsockopt sys_setsockopt 356 common mlock2 sys_mlock2 357 common copy_file_range sys_copy_file_range 358 common preadv2 sys_preadv2 compat_sys_preadv2 diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3d8d70d3896c..1583831f61a9 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,6 +8,13 @@ #include #include +/* + * Reuse the 64-bit entry points for the x32 versions that occupy different + * slots in the syscall table. + */ +#define __x32_sys_getsockopt __x64_sys_getsockopt +#define __x32_sys_setsockopt __x64_sys_setsockopt + #define __SYSCALL_64(nr, sym) #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..43742a69dba1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -376,8 +376,8 @@ 362 i386 connect sys_connect 363 i386 listen sys_listen 364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt 367 i386 getsockname sys_getsockname 368 i386 getpeername sys_getpeername 369 i386 sendto sys_sendto diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt 
compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat diff --git a/include/linux/compat.h b/include/linux/compat.h index e90100c0de72..c4255d8a4a8a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -737,10 +737,6 @@ asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); -asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen); -asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen); asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b951a87da987..aa46825c6f9d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1424,4 +1424,8 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, unsigned int nsops, const struct old_timespec32 __user *timeout); +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen); +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f4a01305d9a6..c8c189a5f0a6 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) 
+__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/net/compat.c b/net/compat.c index 3e6c2c5ff260..091875bd6210 100644 --- a/net/compat.c +++ b/net/compat.c @@ -335,77 +335,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) __scm_destroy(scm); } -static int __compat_sys_setsockopt(int fd, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - struct socket *sock; - - if (optlen > INT_MAX) - return -EINVAL; - - sock = sockfd_lookup(fd, &err); - if (sock) { - err = security_socket_setsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_setsockopt) - err = sock->ops->compat_setsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->setsockopt(sock, level, - optname, optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, - char __user *, optval, unsigned int, optlen) -{ - return __compat_sys_setsockopt(fd, level, optname, optval, optlen); -} - -static int __compat_sys_getsockopt(int fd, int level, int optname, - char __user *optval, - int __user *optlen) -{ - int err; - struct socket *sock = sockfd_lookup(fd, &err); - - if (sock) { - err = security_socket_getsockopt(sock, level, optname); - if (err) { - sockfd_put(sock); - return err; - } - - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, - optname, optval, optlen); - else if (sock->ops->compat_getsockopt) - err = sock->ops->compat_getsockopt(sock, level, - optname, optval, optlen); - else - err = sock->ops->getsockopt(sock, level, - optname, 
optval, optlen); - sockfd_put(sock); - } - return err; -} - -COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, - char __user *, optval, int __user *, optlen) -{ - return __compat_sys_getsockopt(fd, level, optname, optval, optlen); -} - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { @@ -565,13 +494,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: - ret = __compat_sys_setsockopt(a0, a1, a[2], - compat_ptr(a[3]), a[4]); + ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]); break; case SYS_GETSOCKOPT: - ret = __compat_sys_getsockopt(a0, a1, a[2], - compat_ptr(a[3]), - compat_ptr(a[4])); + ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]), + compat_ptr(a[4])); break; case SYS_SENDMSG: ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); diff --git a/net/socket.c b/net/socket.c index b79376b17b45..dec345982abb 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2094,9 +2094,8 @@ static bool sock_use_custom_sol_socket(const struct socket *sock) * Set a socket option. Because we don't know the option lengths we have * to pass the user mode parameter for the protocols to sort out. 
*/ - -static int __sys_setsockopt(int fd, int level, int optname, - char __user *optval, int optlen) +int __sys_setsockopt(int fd, int level, int optname, char __user *optval, + int optlen) { mm_segment_t oldfs = get_fs(); char *kernel_optval = NULL; @@ -2114,8 +2113,10 @@ static int __sys_setsockopt(int fd, int level, int optname, if (err) goto out_put; - err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, - optval, &optlen, &kernel_optval); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname, + optval, &optlen, + &kernel_optval); if (err < 0) goto out_put; if (err > 0) { @@ -2154,9 +2155,8 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. */ - -static int __sys_getsockopt(int fd, int level, int optname, - char __user *optval, int __user *optlen) +int __sys_getsockopt(int fd, int level, int optname, char __user *optval, + int __user *optlen) { int err, fput_needed; struct socket *sock; @@ -2170,7 +2170,8 @@ static int __sys_getsockopt(int fd, int level, int optname, if (err) goto out_put; - max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); + if (!in_compat_syscall()) + max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen); if (level == SOL_SOCKET) err = sock_getsockopt(sock, level, optname, optval, optlen); @@ -2178,8 +2179,10 @@ static int __sys_getsockopt(int fd, int level, int optname, err = sock->ops->getsockopt(sock, level, optname, optval, optlen); - err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, optval, - optlen, max_optlen, err); + if (!in_compat_syscall()) + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname, + optval, optlen, max_optlen, + err); out_put: fput_light(sock->file, fput_needed); return err; diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 
f4a01305d9a6..c8c189a5f0a6 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -606,9 +606,9 @@ __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 __SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, sys_setsockopt) #define __NR_getsockopt 209 -__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 35b61bfc1b1a..b190f2eb2611 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -427,8 +427,8 @@ 336 common recv sys_recv compat_sys_recv 337 common recvfrom sys_recvfrom compat_sys_recvfrom 338 common shutdown sys_shutdown -339 common setsockopt sys_setsockopt compat_sys_setsockopt -340 common getsockopt sys_getsockopt compat_sys_getsockopt +339 common setsockopt sys_setsockopt sys_setsockopt +340 common getsockopt sys_getsockopt sys_getsockopt 341 common sendmsg sys_sendmsg compat_sys_sendmsg 342 common recvmsg sys_recvmsg compat_sys_recvmsg 343 32 recvmmsg sys_recvmmsg_time32 compat_sys_recvmmsg_time32 diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index b38d48464368..56ae24b6e4be 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -372,8 +372,8 @@ 362 common connect sys_connect compat_sys_connect 363 common listen sys_listen sys_listen 364 common accept4 sys_accept4 compat_sys_accept4 -365 common getsockopt sys_getsockopt compat_sys_getsockopt -366 common setsockopt sys_setsockopt 
compat_sys_setsockopt +365 common getsockopt sys_getsockopt sys_getsockopt +366 common setsockopt sys_setsockopt sys_setsockopt 367 common getsockname sys_getsockname compat_sys_getsockname 368 common getpeername sys_getpeername compat_sys_getpeername 369 common sendto sys_sendto compat_sys_sendto diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..e008d638e641 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -396,8 +396,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat -- cgit v1.2.3 From eba75c587e811d3249c8bd50d22bb2266ccd3c0f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Jul 2020 09:29:02 -0400 Subject: icmp: support rfc 4884 Add setsockopt SOL_IP/IP_RECVERR_RFC4884 to return the offset to an extension struct if present. ICMP messages may include an extension structure after the original datagram. RFC 4884 standardized this behavior. It stores the offset in words to the extension header in u8 icmphdr.un.reserved[1]. The field is valid only for ICMP types destination unreachable, time exceeded and parameter problem, if length is at least 128 bytes and entire packet does not exceed 576 bytes. Return the offset to the start of the extension struct when reading an ICMP error from the error queue, if it matches the above constraints. Do not return the raw u8 field. Return the offset from the start of the user buffer, in bytes. The kernel does not return the network and transport headers, so subtract those. Also validate the headers.
Return the offset regardless of validation, as an invalid extension must still not be misinterpreted as part of the original datagram. Note that !invalid does not imply valid. If the extension version does not match, no validation can take place, for instance. For backward compatibility, make this optional, set by setsockopt SOL_IP/IP_RECVERR_RFC4884. For API example and feature test, see github.com/wdebruij/kerneltools/blob/master/tests/recv_icmp_v2.c For forward compatibility, reserve only setsockopt value 1, leaving other bits for additional icmp extensions. Changes v1->v2: - convert word offset to byte offset from start of user buffer - return in ee_data as u8 may be insufficient - define extension struct and object header structs - return len only if constraints met - if returning len, also validate Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/icmp.h | 4 +++ include/net/inet_sock.h | 1 + include/uapi/linux/errqueue.h | 14 ++++++++- include/uapi/linux/icmp.h | 22 ++++++++++++++ include/uapi/linux/in.h | 1 + net/ipv4/icmp.c | 71 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_sockglue.c | 12 ++++++++ 7 files changed, 124 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 81ca84ce3119..8fc38a34cb20 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -15,6 +15,7 @@ #include #include +#include static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { @@ -35,4 +36,7 @@ static inline bool icmp_is_err(int type) return false; } +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out); + #endif /* _LINUX_ICMP_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a7ce00af6c44..a3702d1d4875 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -225,6 +225,7 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, + recverr_rfc4884:1, defer_connect:1; /* 
Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index ca5cb3e3c6df..3c70e8ac14b8 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -5,6 +5,13 @@ #include #include +/* RFC 4884: return offset to extension struct + validation */ +struct sock_ee_data_rfc4884 { + __u16 len; + __u8 flags; + __u8 reserved; +}; + struct sock_extended_err { __u32 ee_errno; __u8 ee_origin; @@ -12,7 +19,10 @@ struct sock_extended_err { __u8 ee_code; __u8 ee_pad; __u32 ee_info; - __u32 ee_data; + union { + __u32 ee_data; + struct sock_ee_data_rfc4884 ee_rfc4884; + }; }; #define SO_EE_ORIGIN_NONE 0 @@ -31,6 +41,8 @@ struct sock_extended_err { #define SO_EE_CODE_TXTIME_INVALID_PARAM 1 #define SO_EE_CODE_TXTIME_MISSED 2 +#define SO_EE_RFC4884_FLAG_INVALID 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 5589eeb791ca..fb169a50895e 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -19,6 +19,7 @@ #define _UAPI_LINUX_ICMP_H #include +#include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ @@ -95,5 +96,26 @@ struct icmp_filter { __u32 data; }; +/* RFC 4884 extension struct: one per message */ +struct icmp_ext_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 reserved1:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + reserved1:4; +#else +#error "Please fix " +#endif + __u8 reserved2; + __sum16 checksum; +}; + +/* RFC 4884 extension object header: one for each object */ +struct icmp_extobj_hdr { + __be16 length; + __u8 class_num; + __u8 class_type; +}; #endif /* _UAPI_LINUX_ICMP_H */ diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 8533bf07450f..3d0d8231dc19 100644 --- a/include/uapi/linux/in.h +++ 
b/include/uapi/linux/in.h @@ -123,6 +123,7 @@ struct in_addr { #define IP_CHECKSUM 23 #define IP_BIND_ADDRESS_NO_PORT 24 #define IP_RECVFRAGSIZE 25 +#define IP_RECVERR_RFC4884 26 /* IP_MTU_DISCOVER values */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */ diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e30515f89802..793aebf07c2a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1116,6 +1116,77 @@ error: goto drop; } +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + int hlen, off; + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return; + } + + /* outer headers up to inner iph. 
skb->data is at inner payload */ + hlen = -skb_transport_offset(skb) - sizeof(struct icmphdr); + + /* per rfc 791: maximum packet length of 576 bytes */ + if (hlen + skb->len > 576) + return; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + off = icmp_hdr(skb)->un.reserved[1] * sizeof(u32); + if (off < 128) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} + int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 86b3b9a7cea3..a5ea02d7a183 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -411,6 +411,9 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; if (skb_pull(skb, payload - skb->data)) { + if (inet_sk(sk)->recverr_rfc4884) + ip_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; @@ -904,6 +907,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: + case IP_RECVERR_RFC4884: if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; @@ -1063,6 +1067,11 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (!val) skb_queue_purge(&sk->sk_error_queue); break; + case IP_RECVERR_RFC4884: + if (val < 0 || val > 1) + goto e_inval; + inet->recverr_rfc4884 = !!val; + break; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -1611,6 +1620,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_RECVERR: val = inet->recverr; break; + case IP_RECVERR_RFC4884: + val = inet->recverr_rfc4884; + break; case IP_MULTICAST_TTL: val = inet->mc_ttl; break; -- cgit 
v1.2.3 From f65b71aa25a65e13cf3d10445a48c63d3eeb942e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:29 +0300 Subject: ptp: add ability to configure duty cycle for periodic output There are external event timestampers (PHCs with support for PTP_EXTTS_REQUEST) that timestamp both event edges. When those edges are very close (such as in the case of a short pulse), there is a chance that the collected timestamp might be of the rising, or of the falling edge, we never know. There are also PHCs capable of generating periodic output with a configurable duty cycle. This is good news, because we can space the rising and falling edge out enough in time, that the risks to overrun the 1-entry timestamp FIFO of the extts PHC are lower (example: the perout PHC can be configured for a period of 1 second, and an "on" time of 0.5 seconds, resulting in a duty cycle of 50%). A flag is introduced for signaling that an on time is present in the perout request structure, for preserving compatibility. Logically speaking, the duty cycle cannot exceed 100% and the PTP core checks for this. PHC drivers that don't support this flag emit a periodic output of an unspecified duty cycle, same as before. The duty cycle is encoded as an "on" time, similar to the "start" and "period" times, and reuses the reserved space while preserving overall binary layout. 
Pahole reported before: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ unsigned int rsv[4]; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; And now: struct ptp_perout_request { struct ptp_clock_time start; /* 0 16 */ struct ptp_clock_time period; /* 16 16 */ unsigned int index; /* 32 4 */ unsigned int flags; /* 36 4 */ union { struct ptp_clock_time on; /* 40 16 */ unsigned int rsv[4]; /* 40 16 */ }; /* 40 16 */ /* size: 56, cachelines: 1, members: 5 */ /* last cacheline: 56 bytes */ }; Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 33 +++++++++++++++++++++++++++------ include/uapi/linux/ptp_clock.h | 17 ++++++++++++++--- 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 375cd6e4aade..e0e6f85966e1 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -191,12 +191,33 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EFAULT; break; } - if (((req.perout.flags & ~PTP_PEROUT_VALID_FLAGS) || - req.perout.rsv[0] || req.perout.rsv[1] || - req.perout.rsv[2] || req.perout.rsv[3]) && - cmd == PTP_PEROUT_REQUEST2) { - err = -EINVAL; - break; + if (cmd == PTP_PEROUT_REQUEST2) { + struct ptp_perout_request *perout = &req.perout; + + if (perout->flags & ~PTP_PEROUT_VALID_FLAGS) { + err = -EINVAL; + break; + } + /* + * The "on" field has undefined meaning if + * PTP_PEROUT_DUTY_CYCLE isn't set, we must still treat + * it as reserved, which must be set to zero. + */ + if (!(perout->flags & PTP_PEROUT_DUTY_CYCLE) && + (perout->rsv[0] || perout->rsv[1] || + perout->rsv[2] || perout->rsv[3])) { + err = -EINVAL; + break; + } + if (perout->flags & PTP_PEROUT_DUTY_CYCLE) { + /* The duty cycle must be subunitary. 
*/ + if (perout->on.sec > perout->period.sec || + (perout->on.sec == perout->period.sec && + perout->on.nsec > perout->period.nsec)) { + err = -ERANGE; + break; + } + } } else if (cmd == PTP_PEROUT_REQUEST) { req.perout.flags &= PTP_PEROUT_V1_VALID_FLAGS; req.perout.rsv[0] = 0; diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index ff070aa64278..1d2841155f7d 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -53,12 +53,14 @@ /* * Bits of the ptp_perout_request.flags field: */ -#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_ONE_SHOT (1<<0) +#define PTP_PEROUT_DUTY_CYCLE (1<<1) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ -#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT) +#define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ + PTP_PEROUT_DUTY_CYCLE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -105,7 +107,16 @@ struct ptp_perout_request { struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; - unsigned int rsv[4]; /* Reserved for future use. */ + union { + /* + * The "on" time of the signal. + * Must be lower than the period. + * Valid only if (flags & PTP_PEROUT_DUTY_CYCLE) is set. + */ + struct ptp_clock_time on; + /* Reserved for future use. */ + unsigned int rsv[4]; + }; }; #define PTP_MAX_SAMPLES 25 /* Maximum allowed offset measurement samples. */ -- cgit v1.2.3 From b6bd41363a1ca39282496803cc32f7515ed917fe Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 17 Jul 2020 01:45:30 +0300 Subject: ptp: introduce a phase offset in the periodic output request Some PHCs like the ocelot/felix switch cannot emit generic periodic output, but just PPS (pulse per second) signals, which: - don't start from arbitrary absolute times, but are rather phase-aligned to the beginning of [the closest next] second. 
- have an optional phase offset relative to that beginning of the second. For those, it was initially established that they should reject any other absolute time for the PTP_PEROUT_REQUEST than 0.000000000 [1]. But when it actually came to writing an application [2] that makes use of this functionality, we realized that we can't really deal generically with PHCs that support absolute start time, and with PHCs that don't, without an explicit interface. Namely, in an ideal world, PHC drivers would ensure that the "perout.start" value written to hardware will result in a functional output. This means that if the PTP time has become in the past of this PHC's current time, it should be automatically fast-forwarded by the driver into a close enough future time that is known to work (note: this is necessary only if the hardware doesn't do this fast-forward by itself). But we don't really know what is the status for PHC drivers in use today, so in the general sense, user space would be risking to have a non-functional periodic output if it simply asked for a start time of 0.000000000. So let's introduce a flag for this type of reduced-functionality hardware, named PTP_PEROUT_PHASE. The start time is just "soon", the only thing we know for sure about this signal is that its rising edge events, Rn, occur at: Rn = perout.phase + n * perout.period The "phase" in the periodic output structure is simply an alias to the "start" time, since both cannot logically be specified at the same time. Therefore, the binary layout of the structure is not affected. [1]: https://patchwork.ozlabs.org/project/netdev/patch/20200320103726.32559-7-yangbo.lu@nxp.com/ [2]: https://www.mail-archive.com/linuxptp-devel@lists.sourceforge.net/msg04142.html Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/uapi/linux/ptp_clock.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 1d2841155f7d..1d108d597f66 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -55,12 +55,14 @@ */ #define PTP_PEROUT_ONE_SHOT (1<<0) #define PTP_PEROUT_DUTY_CYCLE (1<<1) +#define PTP_PEROUT_PHASE (1<<2) /* * flag fields valid for the new PTP_PEROUT_REQUEST2 ioctl. */ #define PTP_PEROUT_VALID_FLAGS (PTP_PEROUT_ONE_SHOT | \ - PTP_PEROUT_DUTY_CYCLE) + PTP_PEROUT_DUTY_CYCLE | \ + PTP_PEROUT_PHASE) /* * No flags are valid for the original PTP_PEROUT_REQUEST ioctl @@ -103,7 +105,20 @@ struct ptp_extts_request { }; struct ptp_perout_request { - struct ptp_clock_time start; /* Absolute start time. */ + union { + /* + * Absolute start time. + * Valid only if (flags & PTP_PEROUT_PHASE) is unset. + */ + struct ptp_clock_time start; + /* + * Phase offset. The signal should start toggling at an + * unspecified integer multiple of the period, plus this value. + * The start time should be "as soon as possible". + * Valid only if (flags & PTP_PEROUT_PHASE) is set. + */ + struct ptp_clock_time phase; + }; struct ptp_clock_time period; /* Desired period, zero means disable. */ unsigned int index; /* Which channel to configure. */ unsigned int flags; -- cgit v1.2.3 From 4787dd582dbde0b7f29eb3dbe59df3da1b350925 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Fri, 17 Jul 2020 08:05:12 +0530 Subject: bareudp: Reverted support to enable & disable rx metadata collection The commit fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") breaks the original (5.7) default behavior of bareudp module to collect RX metadata at the receive. It was added to avoid the crash at the kernel neighbour subsystem when packet with metadata from bareudp is processed.
But it is no more needed as the commit 394de110a733 ("net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb") solves this crash. Fixes: fe80536acf83 ("bareudp: Added attribute to enable & disable rx metadata collection") Signed-off-by: Martin Varghese Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- Documentation/networking/bareudp.rst | 6 ++---- drivers/net/bareudp.c | 21 +++++---------------- include/net/bareudp.h | 1 - include/uapi/linux/if_link.h | 1 - 4 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/bareudp.rst b/Documentation/networking/bareudp.rst index 0e00636d8d74..465a8b251bfe 100644 --- a/Documentation/networking/bareudp.rst +++ b/Documentation/networking/bareudp.rst @@ -48,7 +48,5 @@ enabled. The bareudp device could be used along with OVS or flower filter in TC. The OVS or TC flower layer must set the tunnel information in SKB dst field before sending packet buffer to the bareudp device for transmission. On reception the -bareudp device decapsulates the udp header and passes the inner packet to the -network stack. If RX_COLLECT_METADATA flag is enabled in the device the tunnel -information will be stored in the SKB dst field before the packet buffer is -passed to the network stack. +bareudp device extracts and stores the tunnel information in SKB dst field before +passing the packet buffer to the network stack. 
diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 108a8cafc4f8..44eb2b1d0416 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -46,7 +46,6 @@ struct bareudp_dev { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; @@ -126,14 +125,12 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) bareudp->dev->stats.rx_dropped++; goto drop; } - if (bareudp->rx_collect_metadata) { - tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); - if (!tun_dst) { - bareudp->dev->stats.rx_dropped++; - goto drop; - } - skb_dst_set(skb, &tun_dst->dst); + tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0); + if (!tun_dst) { + bareudp->dev->stats.rx_dropped++; + goto drop; } + skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -577,9 +574,6 @@ static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; - if (data[IFLA_BAREUDP_RX_COLLECT_METADATA]) - conf->rx_collect_metadata = true; - return 0; } @@ -617,7 +611,6 @@ static int bareudp_configure(struct net *net, struct net_device *dev, bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; - bareudp->rx_collect_metadata = conf->rx_collect_metadata; err = register_netdevice(dev); if (err) @@ -676,7 +669,6 @@ static size_t bareudp_get_size(const struct net_device *dev) nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ - nla_total_size(0) + /* IFLA_BAREUDP_RX_COLLECT_METADATA */ 0; } @@ -693,9 +685,6 @@ static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) if 
(bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; - if (bareudp->rx_collect_metadata && - nla_put_flag(skb, IFLA_BAREUDP_RX_COLLECT_METADATA)) - goto nla_put_failure; return 0; diff --git a/include/net/bareudp.h b/include/net/bareudp.h index 3dd5f9a8d01c..dc65a0d71d9b 100644 --- a/include/net/bareudp.h +++ b/include/net/bareudp.h @@ -12,7 +12,6 @@ struct bareudp_conf { __be16 port; u16 sport_min; bool multi_proto_mode; - bool rx_collect_metadata; }; struct net_device *bareudp_dev_create(struct net *net, const char *name, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 26842ffd0501..af8f31987526 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -601,7 +601,6 @@ enum { IFLA_BAREUDP_ETHERTYPE, IFLA_BAREUDP_SRCPORT_MIN, IFLA_BAREUDP_MULTIPROTO_MODE, - IFLA_BAREUDP_RX_COLLECT_METADATA, __IFLA_BAREUDP_MAX }; -- cgit v1.2.3 From 5923b8f7fa218a9bccd730c0a9692635eb2fc740 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Thu, 23 Jul 2020 01:03:01 +0300 Subject: net/sched: cls_flower: Add hash info to flow classification Adding new cls flower keys for hash value and hash mask and dissect the hash info from the skb into the flow key towards flow classification. Signed-off-by: Ariel Levkovich Reviewed-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/uapi/linux/pkt_cls.h | 3 +++ net/sched/cls_flower.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7576209d96f9..ee95f42fb0ec 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -578,6 +578,9 @@ enum { TCA_FLOWER_KEY_MPLS_OPTS, + TCA_FLOWER_KEY_HASH, /* u32 */ + TCA_FLOWER_KEY_HASH_MASK, /* u32 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index acd8e05c2ba5..a4f7ef1de7e7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -64,6 +64,7 @@ struct fl_flow_key { }; } tp_range; struct flow_dissector_key_ct ct; + struct flow_dissector_key_hash hash; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -318,6 +319,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, ARRAY_SIZE(fl_ct_info_to_flower_map)); + skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); f = fl_mask_lookup(mask, &skb_key); @@ -695,6 +697,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_HASH_MASK] = { .type = NLA_U32 }, + }; static const struct nla_policy @@ -1626,6 +1631,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + fl_set_key_val(tb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash)); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { ret = fl_set_enc_opt(tb, key, mask, extack); if (ret) @@ -1740,6 +1749,8 @@ static void 
fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CT, ct); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_HASH, hash); skb_flow_dissector_init(dissector, keys, cnt); } @@ -2960,6 +2971,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + if (fl_dump_key_val(skb, &key->hash.hash, TCA_FLOWER_KEY_HASH, + &mask->hash.hash, TCA_FLOWER_KEY_HASH_MASK, + sizeof(key->hash.hash))) + goto nla_put_failure; + return 0; nla_put_failure: -- cgit v1.2.3 From 01370434df85eb76ecb1527a4466013c4aca2436 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 24 Jul 2020 09:03:10 -0400 Subject: icmp6: support rfc 4884 Extend the rfc 4884 read interface introduced for ipv4 in commit eba75c587e81 ("icmp: support rfc 4884") to ipv6. Add socket option SOL_IPV6/IPV6_RECVERR_RFC4884. Changes v1->v2: - make ipv6_icmp_error_rfc4884 static (file scope) Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/icmpv6.h | 1 + include/uapi/linux/in6.h | 1 + net/ipv4/icmp.c | 1 + net/ipv6/datagram.c | 16 ++++++++++++++++ net/ipv6/ipv6_sockglue.c | 12 ++++++++++++ 6 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8d8f877e7f81..a44789d027cc 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -283,6 +283,7 @@ struct ipv6_pinfo { autoflowlabel:1, autoflowlabel_set:1, mc_all:1, + recverr_rfc4884:1, rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 2622b5a3e616..c1661febc2dc 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -68,6 +68,7 @@ struct icmp6hdr { #define icmp6_mtu icmp6_dataun.un_data32[0] #define icmp6_unused icmp6_dataun.un_data32[0] #define icmp6_maxdelay icmp6_dataun.un_data16[0] +#define icmp6_datagram_len icmp6_dataun.un_data8[0] #define icmp6_router icmp6_dataun.u_nd_advt.router #define icmp6_solicited icmp6_dataun.u_nd_advt.solicited #define icmp6_override icmp6_dataun.u_nd_advt.override diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 9f2273a08356..5ad396a57eb3 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -179,6 +179,7 @@ struct in6_flowlabel_req { #define IPV6_LEAVE_ANYCAST 28 #define IPV6_MULTICAST_ALL 29 #define IPV6_ROUTER_ALERT_ISOLATE 30 +#define IPV6_RECVERR_RFC4884 31 /* IPV6_MTU_DISCOVER values */ #define IPV6_PMTUDISC_DONT 0 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7498c58460a1..cf36f955bfe6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1173,6 +1173,7 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb, if (!ip_icmp_error_rfc4884_validate(skb, off)) out->flags |= SO_EE_RFC4884_FLAG_INVALID; } +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); int icmp_err(struct sk_buff *skb, u32 info) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c 
index 390bedde21a5..cc8ad7ddecda 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -284,6 +285,17 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); +static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out) +{ + switch (icmp6_hdr(skb)->icmp6_type) { + case ICMPV6_TIME_EXCEED: + case ICMPV6_DEST_UNREACH: + ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), + icmp6_hdr(skb)->icmp6_datagram_len * 8); + } +} + void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { @@ -313,6 +325,10 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, serr->port = port; __skb_pull(skb, payload - skb->data); + + if (inet6_sk(sk)->recverr_rfc4884) + ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); + skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d2282f5c9760..20c740976334 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -965,6 +965,14 @@ done: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; + case IPV6_RECVERR_RFC4884: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 1) + goto e_inval; + np->recverr_rfc4884 = valbool; + retv = 0; + break; } release_sock(sk); @@ -1439,6 +1447,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rtalert_isolate; break; + case IPV6_RECVERR_RFC4884: + val = np->recverr_rfc4884; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:12 -0700 Subject: bpf: Implement bpf iterator for map elements The bpf iterator for map elements is implemented.
The bpf program will receive four parameters: bpf_iter_meta *meta: the meta data bpf_map *map: the bpf_map whose elements are traversed void *key: the key of one element void *value: the value of the same element Here, meta and map pointers are always valid, and key has register type PTR_TO_RDONLY_BUF_OR_NULL and value has register type PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key and value during verification time. Later, these values will be compared against the values in the actual map to ensure all accesses are within range. A new field iter_seq_info is added to bpf_map_ops which is used to add map type specific information, i.e., seq_ops, init/fini seq_file func and seq_file private data size. Subsequent patches will have actual implementation for bpf_map_ops->iter_seq_info. In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog attr->link_create.flags, which indicates that attr->link_create.target_fd is a map_fd. The reason for such an explicit flag is for possible future cases where one bpf iterator may allow more than one possible customization, e.g., pid and cgroup id for task_file. Current kernel internal implementation only allows the target to register at most one required bpf_iter_link_info. To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- include/linux/bpf.h | 16 ++++++++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++-------- kernel/bpf/map_iter.c | 30 ++++++++++++++- tools/include/uapi/linux/bpf.h | 7 ++++ 5 files changed, 128 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9c4bb08f616..4175cf1f4665 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -107,6 +107,9 @@ struct bpf_map_ops { /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; + + /* bpf_iter info used to open a seq_file */ + const struct bpf_iter_seq_info *iter_seq_info; }; struct bpf_map_memory { @@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + struct bpf_map *map; }; +typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux); + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; + bpf_iter_check_target_t check_target; u32 ctx_arg_info_size; + enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; @@ -1223,6 +1232,13 @@ struct bpf_iter_meta { u64 seq_num; }; +struct bpf_iter__bpf_map_elem { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(void *, value); +}; + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/include/uapi/linux/bpf.h +++ 
b/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if 
(iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. 
*/ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = 
tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map_elem, key), + PTR_TO_RDONLY_BUF_OR_NULL }, + { offsetof(struct bpf_iter__bpf_map_elem, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, +}; + static int __init bpf_map_iter_init(void) { + int ret; + bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id; - return bpf_iter_reg_target(&bpf_map_reg_info); + ret = bpf_iter_reg_target(&bpf_map_reg_info); + if (ret) + return ret; + + return 
bpf_iter_reg_target(&bpf_map_elem_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. -- cgit v1.2.3 From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ 
__u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? 
prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = 
dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev 
might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) -- cgit v1.2.3 From c1931c9784ebb5787c0784c112fb8baa5e8455b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:59 
-0700 Subject: bpf: Implement BPF XDP link-specific introspection APIs Implement XDP link-specific show_fdinfo and link_info to emit ifindex. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-7-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ net/core/dev.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 87823fb9c123..e1ba4ae6a916 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4069,6 +4069,9 @@ struct bpf_link_info { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); diff --git a/net/core/dev.c b/net/core/dev.c index 49f284f51a22..82ce0920b172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8996,6 +8996,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link) kfree(xdp_link); } +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { @@ -9041,6 +9070,8 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = 
bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; -- cgit v1.2.3 From 8f4c0e01789c18674acdf17cae3822b3dc3db715 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Wed, 22 Jul 2020 10:40:16 -0400 Subject: hsr: enhance netlink socket interface to support PRP Parallel Redundancy Protocol (PRP) is another redundancy protocol introduced by the IEC 62439 standard. It is similar to HSR in many aspects:- - Use a pair of Ethernet interfaces to create the PRP device - Use a 6 byte redundancy protocol part (RCT, Redundancy Check Trailer) similar to HSR Tag. - Has Link Redundancy Entity (LRE) that works with RCT to implement redundancy. Key difference is that the protocol unit is a trailer instead of a prefix as in HSR. That makes it inter-operable with traditional network components such as bridges/switches which treat it as pad bytes, whereas HSR nodes require some kind of translators (called redbox) to talk to regular network devices. This feature allows a regular Linux box to be converted to a DAN-P box. DAN-P stands for Dual Attached Node - PRP similar to DAN-H (Dual Attached Node - HSR). Add a comment at the header/source code to explicitly state that the driver files also handle the PRP protocol. Signed-off-by: Murali Karicheri Signed-off-by: David S. 
Miller --- include/uapi/linux/hsr_netlink.h | 2 +- include/uapi/linux/if_link.h | 12 +++++++++++- net/hsr/Kconfig | 35 +++++++++++++++++++++++------------ net/hsr/hsr_debugfs.c | 2 +- net/hsr/hsr_device.c | 7 +++++-- net/hsr/hsr_device.h | 2 ++ net/hsr/hsr_forward.c | 2 ++ net/hsr/hsr_forward.h | 2 ++ net/hsr/hsr_framereg.c | 1 + net/hsr/hsr_framereg.h | 2 ++ net/hsr/hsr_main.c | 2 ++ net/hsr/hsr_main.h | 11 ++++++++++- net/hsr/hsr_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/hsr/hsr_netlink.h | 2 ++ net/hsr/hsr_slave.c | 2 ++ net/hsr/hsr_slave.h | 2 ++ 16 files changed, 99 insertions(+), 25 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hsr_netlink.h b/include/uapi/linux/hsr_netlink.h index c218ef9c35dd..d540ea9bbef4 100644 --- a/include/uapi/linux/hsr_netlink.h +++ b/include/uapi/linux/hsr_netlink.h @@ -17,7 +17,7 @@ /* Generic Netlink HSR family definition */ -/* attributes */ +/* attributes for HSR or PRP node */ enum { HSR_A_UNSPEC, HSR_A_NODE_ADDR, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index af8f31987526..63af64646358 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -907,7 +907,14 @@ enum { #define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) -/* HSR section */ +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; enum { IFLA_HSR_UNSPEC, @@ -917,6 +924,9 @@ enum { IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ IFLA_HSR_SEQ_NR, IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. 
+ */ __IFLA_HSR_MAX, }; diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 8095b034e76e..1b048c17b6c8 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -4,24 +4,35 @@ # config HSR - tristate "High-availability Seamless Redundancy (HSR)" + tristate "High-availability Seamless Redundancy (HSR & PRP)" help + This enables IEC 62439 defined High-availability Seamless + Redundancy (HSR) and Parallel Redundancy Protocol (PRP). + If you say Y here, then your Linux box will be able to act as a - DANH ("Doubly attached node implementing HSR"). For this to work, - your Linux box needs (at least) two physical Ethernet interfaces, - and it must be connected as a node in a ring network together with - other HSR capable nodes. + DANH ("Doubly attached node implementing HSR") or DANP ("Doubly + attached node implementing PRP"). For this to work, your Linux box + needs (at least) two physical Ethernet interfaces. + + For DANH, it must be connected as a node in a ring network together + with other HSR capable nodes. All Ethernet frames sent over the HSR + device will be sent in both directions on the ring (over both slave + ports), giving a redundant, instant fail-over network. Each HSR node + in the ring acts like a bridge for HSR frames, but filters frames + that have been forwarded earlier. - All Ethernet frames sent over the hsr device will be sent in both - directions on the ring (over both slave ports), giving a redundant, - instant fail-over network. Each HSR node in the ring acts like a - bridge for HSR frames, but filters frames that have been forwarded - earlier. + For DANP, it must be connected as a node connecting to two + separate networks over the two slave interfaces. Like HSR, Ethernet + frames sent over the PRP device will be sent to both networks giving + a redundant, instant fail-over network. Unlike HSR, PRP networks + can have Singly Attached Nodes (SAN) such as PC, printer, bridges + etc and will be able to communicate with DANP nodes. 
This code is a "best effort" to comply with the HSR standard as described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), - but no compliancy tests have been made. Use iproute2 to select - the version you desire. + and PRP standard described in IEC 62439-4:2012 (PRP), but no + compliancy tests have been made. Use iproute2 to select the protocol + you would like to use. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 9787ef11ca71..c1932c0a15be 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -1,5 +1,5 @@ /* - * hsr_debugfs code + * debugfs code for HSR & PRP * Copyright (C) 2019 Texas Instruments Incorporated * * Author(s): diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 8a927b647829..40ac45123a62 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -3,9 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se - * * This file contains device methods for creating, using and destroying - * virtual HSR devices. + * virtual HSR or PRP devices. */ #include @@ -427,6 +426,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + /* currently PRP is not supported */ + if (protocol_version == PRP_V1) + return -EPROTONOSUPPORT; + /* Make sure we recognize frames from ourselves in hsr_rcv() */ res = hsr_create_self_node(hsr, hsr_dev->dev_addr, slave[1]->dev_addr); diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index b8f9262ed101..868373822ee4 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. 
*/ #ifndef __HSR_DEVICE_H diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ab8dca0c0b65..55adb4dbd235 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame router for HSR and PRP. */ #include "hsr_forward.h" diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h index 51a69295566c..b2a6fa319d94 100644 --- a/net/hsr/hsr_forward.h +++ b/net/hsr/hsr_forward.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FORWARD_H diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 530de24b1fb5..13b2190e6556 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -8,6 +8,7 @@ * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. + * Same code handles filtering of duplicates for PRP as well. */ #include diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 0f0fa12b4329..c06447780d05 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_FRAMEREG_H diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 144da15f0a81..2fd1976e5b1c 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Event handling for HSR and PRP devices. */ #include diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index f74193465bf5..8cf10d67d5f9 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. 
*/ #ifndef __HSR_PRIVATE_H @@ -131,6 +133,13 @@ struct hsr_port { enum hsr_port_type type; }; +/* used by driver internally to differentiate various protocols */ +enum hsr_version { + HSR_V0 = 0, + HSR_V1, + PRP_V1, +}; + struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; @@ -141,7 +150,7 @@ struct hsr_priv { int announce_count; u16 sequence_nr; u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ - u8 prot_version; /* Indicate if HSRv0 or HSRv1. */ + enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ unsigned char sup_multicast_addr[ETH_ALEN]; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6e14b7d22639..06c3cd988760 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -4,7 +4,7 @@ * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * - * Routines for handling Netlink messages for HSR. + * Routines for handling Netlink messages for HSR and PRP. 
*/ #include "hsr_netlink.h" @@ -22,6 +22,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_VERSION] = { .type = NLA_U8 }, [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, + [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 }, }; /* Here, it seems a netdevice has already been allocated for us, and the @@ -31,8 +32,10 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + enum hsr_version proto_version; + unsigned char multicast_spec; + u8 proto = HSR_PROTOCOL_HSR; struct net_device *link[2]; - unsigned char multicast_spec, hsr_version; if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); @@ -69,18 +72,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); + if (data[IFLA_HSR_PROTOCOL]) + proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]); + + if (proto >= HSR_PROTOCOL_MAX) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol\n"); + return -EINVAL; + } + if (!data[IFLA_HSR_VERSION]) { - hsr_version = 0; + proto_version = HSR_V0; } else { - hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); - if (hsr_version > 1) { + if (proto == HSR_PROTOCOL_PRP) { + NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported\n"); + return -EINVAL; + } + + proto_version = nla_get_u8(data[IFLA_HSR_VERSION]); + if (proto_version > HSR_V1) { NL_SET_ERR_MSG_MOD(extack, - "Only versions 0..1 are supported"); + "Only HSR version 0/1 supported\n"); return -EINVAL; } } - return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); + if (proto == HSR_PROTOCOL_PRP) + proto_version = PRP_V1; + + return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack); } static void hsr_dellink(struct net_device *dev, struct list_head *head) @@ -102,6 +121,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) static int 
hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct hsr_priv *hsr = netdev_priv(dev); + u8 proto = HSR_PROTOCOL_HSR; struct hsr_port *port; port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); @@ -120,6 +140,10 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; + if (hsr->prot_version == PRP_V1) + proto = HSR_PROTOCOL_PRP; + if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) + goto nla_put_failure; return 0; diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h index 1121bb192a18..501552d9753b 100644 --- a/net/hsr/hsr_netlink.h +++ b/net/hsr/hsr_netlink.h @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_NETLINK_H diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 25b6ffba26cd..b5c0834de338 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -3,6 +3,8 @@ * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * Frame handler and other utility functions for HSR and PRP. */ #include "hsr_slave.h" diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h index 8953ea279ce9..9708a4f0ec09 100644 --- a/net/hsr/hsr_slave.h +++ b/net/hsr/hsr_slave.h @@ -2,6 +2,8 @@ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se + * + * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H -- cgit v1.2.3 From e1613b5714ee6c186c9628e9958edf65e9d9cddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jul 2020 15:47:15 -0700 Subject: bpf: Fix bpf_ringbuf_output() signature to return long Due to bpf tree fix merge, bpf_ringbuf_output() signature ended up with int as a return type, while all other helpers got converted to returning long. So fix it in bpf-next now.
Fixes: b0659d8a950d ("bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200727224715.652037-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification -- cgit v1.2.3 From 50935339c394adfb3d7253055e3bc10ee70264b0 Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sat, 25 Jul 2020 19:02:25 +0200 Subject: netfilter: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. 
Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connmark.h | 2 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/netfilter/Kconfig | 2 +- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nft_set_pipapo.c | 4 ++-- net/netfilter/xt_CONNSECMARK.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_nfacct.c | 2 +- net/netfilter/xt_time.c | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index 1aa5c955ee1e..f01c19b83a2b 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -4,7 +4,7 @@ #include -/* Copyright (C) 2002,2004 MARA Systems AB +/* Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * This program is free software; you can redistribute it and/or modify diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index dc705769acc9..26a9193df783 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -6,7 +6,7 @@ * * DECnet Routing Message Grabulator * - * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * (C) 2000 ChyGwyn Limited - https://www.chygwyn.com/ * * Author: Steven Whitehouse */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0ffe2b8723c4..25313c29d799 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -447,7 +447,7 @@ config NF_TABLES replace the existing {ip,ip6,arp,eb}_tables infrastructure. 
It provides a pseudo-state machine with an extensible instruction-set (also known as expressions) that the userspace 'nft' utility - (http://www.netfilter.org/projects/nftables) uses to build the + (https://www.netfilter.org/projects/nftables) uses to build the rule-set. It also comes with the generic set infrastructure that allows you to construct mappings between matchings and actions for performance lookups. diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5827117f2635..5bfec829c12f 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #include #include diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index cc6082a5f7ad..9944523f5c2c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -312,7 +312,7 @@ * Jay Ligatti, Josh Kuhn, and Chris Gage. * Proceedings of the IEEE International Conference on Computer * Communication Networks (ICCCN), August 2010. - * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf * * [Rottenstreich 2010] * Worst-Case TCAM Rule Expansion @@ -325,7 +325,7 @@ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, * and Patrick Eugster. * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. - * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf */ #include diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index a5c8b653476a..76acecf3e757 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -6,7 +6,7 @@ * with the SECMARK target and state match. 
* * Based somewhat on CONNMARK: - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * * (C) 2006,2008 Red Hat, Inc., James Morris diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index eec2f3a88d73..e5ebc0810675 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -2,7 +2,7 @@ /* * xt_connmark - Netfilter module to operate on connection marks * - * Copyright (C) 2002,2004 MARA Systems AB + * Copyright (C) 2002,2004 MARA Systems AB * by Henrik Nordstrom * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * Jan Engelhardt diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 5aab6df74e0f..a97c2259bbc8 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso - * (C) 2011 Intra2net AG + * (C) 2011 Intra2net AG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 67cb98489415..6aa12d0f54e2 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -5,7 +5,7 @@ * based on ipt_time by Fabrice MARIE * This is a module which is used for time matching * It is using some modified code from dietlibc (localtime() function) - * that you can find at http://www.fefe.de/dietlibc/ + * that you can find at https://www.fefe.de/dietlibc/ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ -- cgit v1.2.3 From df78a0c0b67de58934877aad61e0431a2bd0caf1 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 1 Jun 2020 23:22:47 -0700 Subject: nl80211: S1G band and channel definitions Gives drivers the definitions needed to advertise support for S1G bands. 
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200602062247.23212-1-thomas@adapt-ip.com Link: https://lore.kernel.org/r/20200731055636.795173-1-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 9 ++------- include/net/cfg80211.h | 17 +++++++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/mac80211/chan.c | 7 ++++++- net/mac80211/scan.c | 1 + net/mac80211/tx.c | 1 + net/mac80211/util.c | 5 +++++ net/wireless/chan.c | 35 +++++++++++++++++++++++++++++++++++ net/wireless/core.c | 5 +++-- net/wireless/util.c | 8 ++++++++ 10 files changed, 94 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 919d15584d4a..3c0c33a9f30c 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -568,11 +568,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_40: phymode = MODE_11NG_HT40; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: - case NL80211_CHAN_WIDTH_80: - case NL80211_CHAN_WIDTH_80P80: - case NL80211_CHAN_WIDTH_160: + default: phymode = MODE_UNKNOWN; break; } @@ -597,8 +593,7 @@ chan_to_phymode(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_80P80: phymode = MODE_11AC_VHT80_80; break; - case NL80211_CHAN_WIDTH_5: - case NL80211_CHAN_WIDTH_10: + default: phymode = MODE_UNKNOWN; break; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fc7e8807838d..ac6e58193426 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -417,6 +417,22 @@ struct ieee80211_edmg { enum ieee80211_edmg_bw_config bw_config; }; +/** + * struct ieee80211_sta_s1g_cap - STA's S1G capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ah S1G capabilities for a STA. 
+ * + * @s1g: is STA an S1G STA + * @cap: S1G capabilities information + * @nss_mcs: Supported NSS MCS set + */ +struct ieee80211_sta_s1g_cap { + bool s1g; + u8 cap[10]; /* use S1G_CAPAB_ */ + u8 nss_mcs[5]; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -448,6 +464,7 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_edmg edmg_cap; u16 n_iftype_data; const struct ieee80211_sband_iftype_data *iftype_data; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e6339ab1fce..ad183469f9af 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4437,6 +4437,11 @@ enum nl80211_key_mode { * attribute must be provided as well * @NL80211_CHAN_WIDTH_5: 5 MHz OFDM channel * @NL80211_CHAN_WIDTH_10: 10 MHz OFDM channel + * @NL80211_CHAN_WIDTH_1: 1 MHz OFDM channel + * @NL80211_CHAN_WIDTH_2: 2 MHz OFDM channel + * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel + * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel + * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4447,6 +4452,11 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_160, NL80211_CHAN_WIDTH_5, NL80211_CHAN_WIDTH_10, + NL80211_CHAN_WIDTH_1, + NL80211_CHAN_WIDTH_2, + NL80211_CHAN_WIDTH_4, + NL80211_CHAN_WIDTH_8, + NL80211_CHAN_WIDTH_16, }; /** @@ -4457,11 +4467,15 @@ enum nl80211_chan_width { * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide + * @NL80211_BSS_CHAN_WIDTH_1: control channel is 1 MHz wide + * @NL80211_BSS_CHAN_WIDTH_2: control channel is 2 MHz wide */ enum nl80211_bss_scan_width { NL80211_BSS_CHAN_WIDTH_20, NL80211_BSS_CHAN_WIDTH_10, NL80211_BSS_CHAN_WIDTH_5, + NL80211_BSS_CHAN_WIDTH_1, +
NL80211_BSS_CHAN_WIDTH_2, }; /** @@ -4740,6 +4754,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) + * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4748,6 +4763,7 @@ enum nl80211_band { NL80211_BAND_5GHZ, NL80211_BAND_60GHZ, NL80211_BAND_6GHZ, + NL80211_BAND_S1GHZ, NUM_NL80211_BANDS, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e6e192f53e4e..08cf9da9c1e3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -313,9 +313,14 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, lockdep_assert_held(&local->chanctx_mtx); - /* don't optimize 5MHz, 10MHz, and radar_enabled confs */ + /* don't optimize non-20MHz based and radar_enabled confs */ if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 || ctx->conf.def.width == NL80211_CHAN_WIDTH_10 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_1 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_2 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_4 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_8 || + ctx->conf.def.width == NL80211_CHAN_WIDTH_16 || ctx->conf.radar_enabled) { ctx->conf.min_def = ctx->conf.def; return; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ad90bbe57457..8003be6dae8a 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -913,6 +913,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, case NL80211_BSS_CHAN_WIDTH_10: local->scan_chandef.width = NL80211_CHAN_WIDTH_10; break; + default: case NL80211_BSS_CHAN_WIDTH_20: /* If scanning on oper channel, use whatever channel-type * is currently in use. 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a2941e5244f..ee30ef441f4a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -166,6 +166,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; + case NL80211_BAND_S1GHZ: case NL80211_BAND_60GHZ: /* TODO, for now fall through */ case NUM_NL80211_BANDS: diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 21c94094a699..64a83ecd0a73 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3730,6 +3730,11 @@ u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c) c->width = NL80211_CHAN_WIDTH_20_NOHT; ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; break; + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: WARN_ON_ONCE(1); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index cddf92c5d09e..90f0f82cd9ca 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -153,6 +153,11 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: @@ -263,6 +268,21 @@ static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) int width; switch (c->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; @@ -911,6 +931,21 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, 
control_freq = chandef->chan->center_freq; switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + width = 1; + break; + case NL80211_CHAN_WIDTH_2: + width = 2; + break; + case NL80211_CHAN_WIDTH_4: + width = 4; + break; + case NL80211_CHAN_WIDTH_8: + width = 8; + break; + case NL80211_CHAN_WIDTH_16: + width = 16; + break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index c623d9bf5096..1971d7e6eb55 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -803,10 +803,11 @@ int wiphy_register(struct wiphy *wiphy) if (WARN_ON(!sband->n_channels)) return -EINVAL; /* - * on 60GHz band, there are no legacy rates, so + * on 60GHz or sub-1Ghz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != NL80211_BAND_60GHZ && + if (WARN_ON((band != NL80211_BAND_60GHZ && + band != NL80211_BAND_S1GHZ) && !sband->n_bitrates)) return -EINVAL; diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d3b76f94f55..26a977343c3b 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -102,6 +102,8 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; + case NL80211_BAND_S1GHZ: + return 902000 + chan * 500; default: ; } @@ -210,6 +212,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; + case NL80211_BAND_S1GHZ: + /* Figure 9-589bd: 3 means unsupported, so != 3 means at least + * mandatory is ok. + */ + WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); + break; case NUM_NL80211_BANDS: default: WARN_ON(1); -- cgit v1.2.3 From 987021726f9f41a1daf335c57cd7b6261109cdb2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:21 -0700 Subject: net/wireless: nl80211.h: drop duplicate words in comments Drop doubled words in several comments. 
Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad183469f9af..f47a7a8d0216 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -363,7 +363,7 @@ * @NL80211_CMD_SET_STATION: Set station attributes for station identified by * %NL80211_ATTR_MAC on the interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_NEW_STATION: Add a station with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_STATION: Remove a station identified by %NL80211_ATTR_MAC * or, if no MAC address given, all stations, on the interface identified * by %NL80211_ATTR_IFINDEX. %NL80211_ATTR_MGMT_SUBTYPE and @@ -383,7 +383,7 @@ * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by * %NL80211_ATTR_MAC. * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the - * the interface identified by %NL80211_ATTR_IFINDEX. + * interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC * or, if no MAC address given, all mesh paths, on the interface identified * by %NL80211_ATTR_IFINDEX. @@ -934,7 +934,7 @@ * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules. 
* * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the - * the new channel information (Channel Switch Announcement - CSA) + * new channel information (Channel Switch Announcement - CSA) * in the beacon for some time (as defined in the * %NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the * new channel. Userspace provides the new channel information (using @@ -1113,7 +1113,7 @@ * randomization may be enabled and configured by specifying the * %NL80211_ATTR_MAC and %NL80211_ATTR_MAC_MASK attributes. * If a timeout is requested, use the %NL80211_ATTR_TIMEOUT attribute. - * A u64 cookie for further %NL80211_ATTR_COOKIE use is is returned in + * A u64 cookie for further %NL80211_ATTR_COOKIE use is returned in * the netlink extended ack message. * * To cancel a measurement, close the socket that requested it. @@ -1511,7 +1511,7 @@ enum nl80211_commands { * rates as defined by IEEE 802.11 7.3.2.2 but without the length * restriction (at most %NL80211_MAX_SUPP_RATES). * @NL80211_ATTR_STA_VLAN: interface index of VLAN interface to move station - * to, or the AP interface the station was originally added to to. + * to, or the AP interface the station was originally added to. * @NL80211_ATTR_STA_INFO: information about a station, part of station info * given for %NL80211_CMD_GET_STATION, nested attribute containing * info as possible, see &enum nl80211_sta_info. @@ -2084,7 +2084,7 @@ enum nl80211_commands { * @NL80211_ATTR_STA_SUPPORTED_CHANNELS: array of supported channels. * * @NL80211_ATTR_STA_SUPPORTED_OPER_CLASSES: array of supported - * supported operating classes. + * operating classes. * * @NL80211_ATTR_HANDLE_DFS: A flag indicating whether user space * controls DFS operation in IBSS mode. If the flag is included in @@ -2395,7 +2395,7 @@ enum nl80211_commands { * nl80211_txq_stats) * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy. * The smaller of this and the memory limit is enforced. 
- * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the + * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory limit (in bytes) for the * TXQ queues for this phy. The smaller of this and the packet limit is * enforced. * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes @@ -5652,7 +5652,7 @@ enum nl80211_feature_flags { * enum nl80211_ext_feature_index - bit index of extended features. * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can - * can request to use RRM (see %NL80211_ATTR_USE_RRM) with + * request to use RRM (see %NL80211_ATTR_USE_RRM) with * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set * the ASSOC_REQ_USE_RRM flag in the association request even if * NL80211_FEATURE_QUIET is not advertized. @@ -6061,7 +6061,7 @@ enum nl80211_dfs_state { }; /** - * enum enum nl80211_protocol_features - nl80211 protocol features + * enum nl80211_protocol_features - nl80211 protocol features * @NL80211_PROTOCOL_FEATURE_SPLIT_WIPHY_DUMP: nl80211 supports splitting * wiphy dumps (if requested by the application with the attribute * %NL80211_ATTR_SPLIT_WIPHY_DUMP. Also supported is filtering the -- cgit v1.2.3 From 0f55c0c500f2bbfc5cc5590cdf6973b3f64dc195 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Jul 2020 09:43:22 -0700 Subject: net/wireless: wireless.h: drop duplicate word in comments Drop doubled word "threshold" in a comment. 
Signed-off-by: Randy Dunlap Cc: netdev@vger.kernel.org Cc: Kalle Valo Cc: linux-wireless@vger.kernel.org Cc: Johannes Berg Link: https://lore.kernel.org/r/20200715164325.9109-2-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/wireless.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index 24f3371ad826..08967b3f19c8 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -914,7 +914,7 @@ union iwreq_data { struct iw_param sens; /* signal level threshold */ struct iw_param bitrate; /* default bit rate */ struct iw_param txpower; /* default transmit power */ - struct iw_param rts; /* RTS threshold threshold */ + struct iw_param rts; /* RTS threshold */ struct iw_param frag; /* Fragmentation threshold */ __u32 mode; /* Operation mode */ struct iw_param retry; /* Retry limits & lifetime */ -- cgit v1.2.3 From e3718a611470d311a92c60d4eb535270b49a7108 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Wed, 17 Jun 2020 09:30:33 +0200 Subject: cfg80211/mac80211: add mesh_param "mesh_nolearn" to skip path discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, before being able to forward a packet between two 802.11s nodes, both a PLINK handshake is performed upon receiving a beacon and then later a PREQ/PREP exchange for path discovery is performed on demand upon receiving a data frame to forward. When running a mesh protocol on top of an 802.11s interface, like batman-adv, we do not need the multi-hop mesh routing capabilities of 802.11s and usually set mesh_fwding=0. However, even with mesh_fwding=0 the PREQ/PREP path discovery is still performed on demand, even though in this scenario the next hop that PREQ/PREP will determine is always the direct 11s neighbor node.
The new mesh_nolearn parameter allows skipping the PREQ/PREP exchange in this scenario, leading to a reduced delay and reduced packet buffering, and simplifies HWMP in general. mesh_nolearn is still rather conservative in that if the packet destination is not a direct 11s neighbor, it will fall back to PREQ/PREP path discovery. For normal, multi-hop 802.11s mesh routing it is usually not advisable to enable mesh_nolearn as a transmission to a direct but distant neighbor might be worse than reaching that same node via a more robust / higher throughput etc. multi-hop path. Cc: Sven Eckelmann Cc: Simon Wunderlich Signed-off-by: Linus Lüssing Link: https://lore.kernel.org/r/20200617073034.26149-1-linus.luessing@c0d3.blue [fix nl80211 policy to range 0/1 only] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 7 +++++++ net/mac80211/cfg.c | 2 ++ net/mac80211/debugfs_netdev.c | 2 ++ net/mac80211/mesh_hwmp.c | 39 +++++++++++++++++++++++++++++++++++++++ net/wireless/mesh.c | 1 + net/wireless/nl80211.c | 7 ++++++- net/wireless/trace.h | 4 +++- 8 files changed, 66 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fa4d5627397f..78b220950942 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1870,6 +1870,11 @@ struct bss_parameters { * connected to a mesh gate in mesh formation info. If false, the * value in mesh formation is determined by the presence of root paths * in the mesh path table + * @dot11MeshNolearn: Try to avoid multi-hop path discovery (e.g. PREQ/PREP + * for HWMP) if the destination is a direct neighbor. Note that this might + * not be the optimal decision as a multi-hop route might be better. So + * if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top.
*/ struct mesh_config { u16 dot11MeshRetryTimeout; @@ -1901,6 +1906,7 @@ struct mesh_config { enum nl80211_mesh_power_mode power_mode; u16 dot11MeshAwakeWindowDuration; u32 plink_timeout; + bool dot11MeshNolearn; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f47a7a8d0216..a83d8faf88ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4236,6 +4236,12 @@ enum nl80211_mesh_power_mode { * field. If left unset then the mesh formation field will only * advertise such if there is an active root mesh path. * + * @NL80211_MESHCONF_NOLEARN: Try to avoid multi-hop path discovery (e.g. + * PREQ/PREP for HWMP) if the destination is a direct neighbor. Note that + * this might not be the optimal decision as a multi-hop route might be + * better. So if using this setting you will likely also want to disable + * dot11MeshForwarding and use another mesh routing protocol on top. + * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4269,6 +4275,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_AWAKE_WINDOW, NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, + NL80211_MESHCONF_NOLEARN, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b4a74064675e..9af56b848544 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2126,6 +2126,8 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask)) conf->dot11MeshConnectedToMeshGate = nconf->dot11MeshConnectedToMeshGate; + if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) + conf->dot11MeshNolearn = nconf->dot11MeshNolearn; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index d7e955127d5c..09eab2c3f380 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ 
-638,6 +638,7 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC); IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); +IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -762,6 +763,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(power_mode); MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); + MESHPARAMS_ADD(dot11MeshNolearn); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index bae3a3e15b88..bec23d2eee7a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1172,6 +1172,40 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, return -ENOENT; } +/** + * mesh_nexthop_lookup_nolearn - try to set next hop without path discovery + * @skb: 802.11 frame to be sent + * @sdata: network subif the frame will be sent through + * + * Check if the meshDA (addr3) of a unicast frame is a direct neighbor. + * And if so, set the RA (addr1) to it to transmit to this node directly, + * avoiding PREQ/PREP path discovery. + * + * Returns: 0 if the next hop was found and -ENOENT otherwise. + */ +static int mesh_nexthop_lookup_nolearn(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + struct sta_info *sta; + + if (is_multicast_ether_addr(hdr->addr1)) + return -ENOENT; + + rcu_read_lock(); + sta = sta_info_get(sdata, hdr->addr3); + + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { + rcu_read_unlock(); + return -ENOENT; + } + rcu_read_unlock(); + + memcpy(hdr->addr1, hdr->addr3, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + return 0; +} + /** * mesh_nexthop_lookup - put the appropriate next hop on a mesh frame. 
Calling * this function is considered "using" the associated mpath, so preempt a path @@ -1185,11 +1219,16 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata, int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_path *mpath; struct sta_info *next_hop; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; u8 *target_addr = hdr->addr3; + if (ifmsh->mshcfg.dot11MeshNolearn && + !mesh_nexthop_lookup_nolearn(sdata, skb)) + return 0; + mpath = mesh_path_lookup(sdata, target_addr); if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) return -ENOENT; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index eac5aa1419fc..e4e363138279 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -78,6 +78,7 @@ const struct mesh_config default_mesh_config = { .power_mode = NL80211_MESH_POWER_ACTIVE, .dot11MeshAwakeWindowDuration = MESH_DEFAULT_AWAKE_WINDOW, .plink_timeout = MESH_DEFAULT_PLINK_TIMEOUT, + .dot11MeshNolearn = false, }; const struct mesh_setup default_mesh_setup = { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6fdf818f66cf..257c06315464 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6885,7 +6885,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u32(msg, NL80211_MESHCONF_PLINK_TIMEOUT, cur_params.plink_timeout) || nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, - cur_params.dot11MeshConnectedToMeshGate)) + cur_params.dot11MeshConnectedToMeshGate) || + nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, + cur_params.dot11MeshNolearn)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6943,6 +6945,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_AWAKE_WINDOW] = { .type = NLA_U16 }, [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + 
[NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7094,6 +7097,8 @@ do { \ NL80211_MESHCONF_AWAKE_WINDOW, nla_get_u16); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, plink_timeout, mask, NL80211_MESHCONF_PLINK_TIMEOUT, nla_get_u32); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshNolearn, mask, + NL80211_MESHCONF_NOLEARN, nla_get_u8); if (mask_out) *mask_out = mask; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b23cab016521..6e218a0acd4e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -68,7 +68,8 @@ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ - __field(u16, dot11MeshHWMPconfirmationInterval) + __field(u16, dot11MeshHWMPconfirmationInterval) \ + __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ @@ -109,6 +110,7 @@ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ + __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ -- cgit v1.2.3 From 184eebe664f0e11c485f6d309fe56297b3f75e9e Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:37 +0200 Subject: cfg80211/mac80211: add connected to auth server to meshconf Besides information about num of peerings and gate connectivity, the mesh formation byte also contains a flag for authentication server connectivity, that currently cannot be set in the mesh conf. This patch adds this capability, which is necessary to implement 802.1X authentication in mesh mode. 
Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-1-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + include/uapi/linux/nl80211.h | 5 +++++ net/mac80211/cfg.c | 3 +++ net/mac80211/debugfs_netdev.c | 3 +++ net/mac80211/mesh.c | 5 ++++- net/wireless/nl80211.c | 8 +++++++- 6 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 78b220950942..8d5071f84ffe 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1895,6 +1895,7 @@ struct mesh_config { u16 dot11MeshHWMPnetDiameterTraversalTime; u8 dot11MeshHWMPRootMode; bool dot11MeshConnectedToMeshGate; + bool dot11MeshConnectedToAuthServer; u16 dot11MeshHWMPRannInterval; bool dot11MeshGateAnnouncementProtocol; bool dot11MeshForwarding; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a83d8faf88ac..f1770e3756f4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4242,6 +4242,10 @@ enum nl80211_mesh_power_mode { * better. So if using this setting you will likely also want to disable * dot11MeshForwarding and use another mesh routing protocol on top. * + * @NL80211_MESHCONF_CONNECTED_TO_AS: If set to true then this mesh STA + * will advertise that it is connected to a authentication server + * in the mesh formation field. 
+ * * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use */ enum nl80211_meshconf_params { @@ -4276,6 +4280,7 @@ enum nl80211_meshconf_params { NL80211_MESHCONF_PLINK_TIMEOUT, NL80211_MESHCONF_CONNECTED_TO_GATE, NL80211_MESHCONF_NOLEARN, + NL80211_MESHCONF_CONNECTED_TO_AS, /* keep last */ __NL80211_MESHCONF_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9af56b848544..6a6531a50e54 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2128,6 +2128,9 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy, nconf->dot11MeshConnectedToMeshGate; if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask)) conf->dot11MeshNolearn = nconf->dot11MeshNolearn; + if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask)) + conf->dot11MeshConnectedToAuthServer = + nconf->dot11MeshConnectedToAuthServer; ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON); return 0; } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 09eab2c3f380..fe8a7a87e513 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -639,6 +639,8 @@ IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration, IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate, u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC); IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC); +IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer, + u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC); #endif #define DEBUGFS_ADD_MODE(name, mode) \ @@ -764,6 +766,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata) MESHPARAMS_ADD(dot11MeshAwakeWindowDuration); MESHPARAMS_ADD(dot11MeshConnectedToMeshGate); MESHPARAMS_ADD(dot11MeshNolearn); + MESHPARAMS_ADD(dot11MeshConnectedToAuthServer); #undef MESHPARAMS_ADD } #endif diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 96f0323c0a3d..d0db6af16427 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -260,6 +260,7 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data 
*sdata, bool is_connected_to_gate = ifmsh->num_gates > 0 || ifmsh->mshcfg.dot11MeshGateAnnouncementProtocol || ifmsh->mshcfg.dot11MeshConnectedToMeshGate; + bool is_connected_to_as = ifmsh->mshcfg.dot11MeshConnectedToAuthServer; if (skb_tailroom(skb) < 2 + meshconf_len) return -ENOMEM; @@ -284,7 +285,9 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh Formation Info - number of neighbors */ neighbors = atomic_read(&ifmsh->estab_plinks); neighbors = min_t(int, neighbors, IEEE80211_MAX_MESH_PEERINGS); - *pos++ = (neighbors << 1) | is_connected_to_gate; + *pos++ = (is_connected_to_as << 7) | + (neighbors << 1) | + is_connected_to_gate; /* Mesh capability */ *pos = 0x00; *pos |= ifmsh->mshcfg.dot11MeshForwarding ? diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 257c06315464..434fd06dc5cf 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6887,7 +6887,9 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_GATE, cur_params.dot11MeshConnectedToMeshGate) || nla_put_u8(msg, NL80211_MESHCONF_NOLEARN, - cur_params.dot11MeshNolearn)) + cur_params.dot11MeshNolearn) || + nla_put_u8(msg, NL80211_MESHCONF_CONNECTED_TO_AS, + cur_params.dot11MeshConnectedToAuthServer)) goto nla_put_failure; nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); @@ -6946,6 +6948,7 @@ nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] = { [NL80211_MESHCONF_PLINK_TIMEOUT] = { .type = NLA_U32 }, [NL80211_MESHCONF_CONNECTED_TO_GATE] = NLA_POLICY_RANGE(NLA_U8, 0, 1), [NL80211_MESHCONF_NOLEARN] = NLA_POLICY_RANGE(NLA_U8, 0, 1), + [NL80211_MESHCONF_CONNECTED_TO_AS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static const struct nla_policy @@ -7058,6 +7061,9 @@ do { \ FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToMeshGate, mask, NL80211_MESHCONF_CONNECTED_TO_GATE, nla_get_u8); + FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConnectedToAuthServer, mask, + NL80211_MESHCONF_CONNECTED_TO_AS, + 
nla_get_u8); /* * Check HT operation mode based on * IEEE 802.11-2016 9.4.2.57 HT Operation element. -- cgit v1.2.3 From 1303a51c24100b3b1915d6f9072fe5ae5bb4c5f6 Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Thu, 11 Jun 2020 16:02:38 +0200 Subject: cfg80211/mac80211: add connected to auth server to station info This patch adds the necessary bits to later query the auth server flag for every peer from iw. Signed-off-by: Markus Theil Link: https://lore.kernel.org/r/20200611140238.427461-2-markus.theil@tu-ilmenau.de Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 3 +++ net/mac80211/sta_info.c | 4 +++- net/mac80211/sta_info.h | 2 ++ net/wireless/nl80211.c | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8d5071f84ffe..39fe21edd2c5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1598,6 +1598,7 @@ struct cfg80211_tid_stats { * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. * @airtime_link_metric: mesh airtime link metric. 
+ * @connected_to_as: true if mesh STA has a path to authentication server */ struct station_info { u64 filled; @@ -1655,6 +1656,8 @@ struct station_info { u32 fcs_err_count; u32 airtime_link_metric; + + u8 connected_to_as; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1770e3756f4..d6b6599a6001 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3370,6 +3370,8 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station * @NL80211_STA_INFO_ASSOC_AT_BOOTTIME: Timestamp (CLOCK_BOOTTIME, nanoseconds) * of STA's association + * @NL80211_STA_INFO_CONNECTED_TO_AS: set to true if STA has a path to a + * authentication server (u8, 0 or 1) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3417,6 +3419,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_AIRTIME_WEIGHT, NL80211_STA_INFO_AIRTIME_LINK_METRIC, NL80211_STA_INFO_ASSOC_AT_BOOTTIME, + NL80211_STA_INFO_CONNECTED_TO_AS, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cd8487bc6fc2..a39773b40457 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2426,7 +2426,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE); + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; @@ -2439,6 +2440,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } diff 
--git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 49728047dfad..9d398c9daa4c 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -385,6 +385,7 @@ DECLARE_EWMA(mesh_tx_rate_avg, 8, 16) * @processed_beacon: set to true after peer rates and capabilities are * processed * @connected_to_gate: true if mesh STA has a path to a mesh gate + * @connected_to_as: true if mesh STA has a path to a authentication server * @fail_avg: moving percentage of failed MSDUs * @tx_rate_avg: moving average of tx bitrate */ @@ -404,6 +405,7 @@ struct mesh_sta { bool processed_beacon; bool connected_to_gate; + bool connected_to_as; enum nl80211_plink_state plink_state; u32 plink_timeout; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 434fd06dc5cf..13a38aab1565 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5395,6 +5395,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PEER_PM, peer_pm, u32); PUT_SINFO(NONPEER_PM, nonpeer_pm, u32); PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8); + PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8); if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { bss_param = nla_nest_start_noflag(msg, -- cgit v1.2.3 From fd17dba1c860d39f655a3a08387c21e3ceca8c55 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 20 Jul 2020 13:12:25 +0530 Subject: cfg80211: Add support to advertize OCV support Add a new feature flag that drivers can use to advertize support for Operating Channel Validation (OCV) when using driver's SME for RSNA handshakes. 
Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200720074225.8990-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d6b6599a6001..a3ae2b060a55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5804,6 +5804,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS: The driver * can report tx status for control port over nl80211 tx operations. * + * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating + * Channel Validation (OCV) when using driver's SME for RSNA handshakes. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5859,6 +5862,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT, NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, + NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From f96622749a67d40ad5efe8a58d5fc95313097aa0 Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Tue, 23 Jun 2020 08:49:35 -0500 Subject: nl80211: support 4-way handshake offloading for WPA/WPA2-PSK in AP mode Let drivers advertise support for AP-mode WPA/WPA2-PSK 4-way handshake offloading with a new NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag. Extend use of NL80211_ATTR_PMK attribute indicating it might be passed as part of NL80211_CMD_START_AP command, and contain the PSK (which is the PMK, hence the name). The driver is assumed to handle the 4-way handshake by itself in this case, instead of relying on userspace. 
Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200623134938.39997-2-chi-hsien.lin@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 41 ++++++++++++++++++++++++++++------------- net/wireless/nl80211.c | 4 +++- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a3ae2b060a55..631f3a997b3c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -183,18 +183,27 @@ * * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK flag drivers * can indicate they support offloading EAPOL handshakes for WPA/WPA2 - * preshared key authentication. In %NL80211_CMD_CONNECT the preshared - * key should be specified using %NL80211_ATTR_PMK. Drivers supporting - * this offload may reject the %NL80211_CMD_CONNECT when no preshared - * key material is provided, for example when that driver does not - * support setting the temporal keys through %CMD_NEW_KEY. + * preshared key authentication in station mode. In %NL80211_CMD_CONNECT + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_CONNECT when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * Similarly @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X flag can be * set by drivers indicating offload support of the PTK/GTK EAPOL - * handshakes during 802.1X authentication. In order to use the offload - * the %NL80211_CMD_CONNECT should have %NL80211_ATTR_WANT_1X_4WAY_HS - * attribute flag. Drivers supporting this offload may reject the - * %NL80211_CMD_CONNECT when the attribute flag is not present. + * handshakes during 802.1X authentication in station mode. 
In order to + * use the offload the %NL80211_CMD_CONNECT should have + * %NL80211_ATTR_WANT_1X_4WAY_HS attribute flag. Drivers supporting this + * offload may reject the %NL80211_CMD_CONNECT when the attribute flag is + * not present. + * + * By setting @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK flag drivers + * can indicate they support offloading EAPOL handshakes for WPA/WPA2 + * preshared key authentication in AP mode. In %NL80211_CMD_START_AP + * the preshared key should be specified using %NL80211_ATTR_PMK. Drivers + * supporting this offload may reject the %NL80211_CMD_START_AP when no + * preshared key material is provided, for example when that driver does + * not support setting the temporal keys through %NL80211_CMD_NEW_KEY. * * For 802.1X the PMK or PMK-R0 are set by providing %NL80211_ATTR_PMK * using %NL80211_CMD_SET_PMK. For offloaded FT support also @@ -2362,10 +2371,11 @@ enum nl80211_commands { * * @NL80211_ATTR_PMK: attribute for passing PMK key material. Used with * %NL80211_CMD_SET_PMKSA for the PMKSA identified by %NL80211_ATTR_PMKID. - * For %NL80211_CMD_CONNECT it is used to provide PSK for offloading 4-way - * handshake for WPA/WPA2-PSK networks. For 802.1X authentication it is - * used with %NL80211_CMD_SET_PMK. For offloaded FT support this attribute - * specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME is included as well. + * For %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP it is used to provide + * PSK for offloading 4-way handshake for WPA/WPA2-PSK networks. For 802.1X + * authentication it is used with %NL80211_CMD_SET_PMK. For offloaded FT + * support this attribute specifies the PMK-R0 if NL80211_ATTR_PMKR0_NAME + * is included as well. * * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to * indicate that it supports multiple active scheduled scan requests. 
@@ -5807,6 +5817,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION: Driver supports Operating * Channel Validation (OCV) when using driver's SME for RSNA handshakes. * + * @NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK: Device wants to do 4-way + * handshake with PSK in AP mode (PSK is passed as part of the start AP + * command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5863,6 +5877,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_FREQ_KHZ, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8d78a6fc59a3..a096682ec0ad 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9442,7 +9442,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (nla_len(info->attrs[NL80211_ATTR_PMK]) != WLAN_PMK_LEN) return -EINVAL; if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK)) + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_PSK) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK)) return -EINVAL; settings->psk = nla_data(info->attrs[NL80211_ATTR_PMK]); } -- cgit v1.2.3 From 48040793fa6003d211f021c6ad273477bcd90d91 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 30 Jul 2020 15:44:40 -0700 Subject: tcp: add earliest departure time to SCM_TIMESTAMPING_OPT_STATS This change adds TCP_NLA_EDT to SCM_TIMESTAMPING_OPT_STATS that reports the earliest departure time (EDT) of the timestamped skb. By tracking EDT values of the skb from different timestamps, we can observe when and how much the value changed. This allows measuring the precise delay injected on the sender host, e.g. by a bpf-based throttler.
Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/tcp.h | 3 ++- include/uapi/linux/tcp.h | 1 + net/core/skbuff.c | 2 +- net/ipv4/tcp.c | 6 +++++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 527d668a5275..14b62d7df942 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -484,7 +484,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk); +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index f2acb2566333..cfcb10b75483 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -313,6 +313,7 @@ enum { TCP_NLA_SRTT, /* smoothed RTT in usecs */ TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ + TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b8afefe6f6b6..4e2edfbe0e19 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4692,7 +4692,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - skb = tcp_get_timestamping_opt_stats(sk); + skb = tcp_get_timestamping_opt_stats(sk, orig_skb); opt_stats = true; } else #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4afec552f211..c06d2bfd2ec4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3501,10 +3501,12 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + 
/* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ + nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ 0; } -struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) +struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, + const struct sk_buff *orig_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; @@ -3558,6 +3560,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); + nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, + TCP_NLA_PAD); return stats; } -- cgit v1.2.3 From 829eb208e80d6db95c0201cb8fa00c2f9ad87faf Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 31 Jul 2020 17:34:01 -0700 Subject: rtnetlink: add support for protodown reason netdev protodown is a mechanism that allows protocols to hold an interface down. It was initially introduced in the kernel to hold links down by a multihoming protocol. There was also an attempt to introduce protodown reason at the time, but it was rejected. protodown and protodown reason are supported by almost every switching and routing platform. It was ok for a while to live without a protodown reason. But it's become more critical now, given more than one protocol may need to keep a link down on a system at the same time, e.g. vrrp peer node, port security, multihoming protocol. It's common for network operators and protocol developers to look for such a reason on a networking box (it's also known as errDisable by most networking operators). This patch adds support for link protodown reason attribute. There are two ways to maintain protodown reasons.
(a) enumerate every possible reason code in kernel - A protocol developer has to make a request and have that appear in a certain kernel version (b) provide the bits in the kernel, and allow user-space (sysadmin or NOS distributions) to manage the bit-to-reasonname map. - This makes extending reason codes easier (kind of like the iproute2 table to vrf-name map /etc/iproute2/rt_tables.d/) This patch takes approach (b). a few things about the patch: - It treats the protodown reason bits as counter to indicate active protodown users - Since protodown attribute is already an exposed UAPI, the reason is not enforced on a protodown set. Its a no-op if not used. the patch follows the below algorithm: - presence of reason bits set indicates protodown is in use - user can set protodown and protodown reason in a single or multiple setlink operations - setlink operation to clear protodown, will return -EBUSY if there are active protodown reason bits - reason is not included in link dumps if not used example with patched iproute2: $cat /etc/iproute2/protodown_reasons.d/r.conf 0 mlag 1 evpn 2 vrrp 3 psecurity $ip link set dev vxlan0 protodown on protodown_reason vrrp on $ip link set dev vxlan0 protodown_reason mlag on $ip link show 14: vxlan0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether f6:06:be:17:91:e7 brd ff:ff:ff:ff:ff:ff protodown on $ip link set dev vxlan0 protodown_reason mlag off $ip link set dev vxlan0 protodown off protodown_reason vrrp off Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++ include/uapi/linux/if_link.h | 10 ++++ net/core/dev.c | 25 ++++++++++ net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 147 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..ba0fa6b22787 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2058,6 +2058,8 @@ struct net_device { struct timer_list watchdog_timer; int watchdog_timeo; + u32 proto_down_reason; + struct list_head todo_list; int __percpu *pcpu_refcnt; @@ -3810,6 +3812,8 @@ int dev_get_port_parent_id(struct net_device *dev, bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 63af64646358..7fba4de511de 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -170,12 +170,22 @@ enum { IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, __IFLA_MAX }; #define IFLA_MAX (__IFLA_MAX - 1) +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + /* backwards compatibility for userspace */ #ifndef __KERNEL__ #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct 
ifinfomsg)))) diff --git a/net/core/dev.c b/net/core/dev.c index 38a6371d9bc5..f7ef0f5c5569 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8715,6 +8715,31 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) +{ + int b; + + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); + u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..a54c3e0f2ee1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1658,6 +1668,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct 
net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1747,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1875,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2525,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct 
netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. 
*/ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2874,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.2.3 From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach bpf_link without destroying it. It has the same behavior as auto-detaching of bpf_link due to cgroup dying for bpf_cgroup_link or net_device being destroyed for bpf_xdp_link. In such case, bpf_link is still a valid kernel object, but it is defunct and doesn't hold a BPF program attached to the corresponding BPF hook. This functionality allows users with enough access rights to manually force-detach attached bpf_link without killing respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ net/core/dev.c | 11 ++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c5e206ecf2..cef4ef0d2b4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -793,6 +793,7 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link 
*link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 
100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/net/core/dev.c b/net/core/dev.c index a2a57988880a..c8b911b10187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link) /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ - if (xdp_link->dev) + if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } rtnl_unlock(); } +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); @@ -9066,6 +9074,7 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, -- cgit v1.2.3 From 
9d2f627b7ec9d5d3246b6cec17f290ee6778c83b Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:20:56 +0200 Subject: net: openvswitch: add masks cache hit counter Add a counter that counts the number of masks cache hits, and export it through the megaflow netlink statistics. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/datapath.c | 5 ++++- net/openvswitch/datapath.h | 3 +++ net/openvswitch/flow_table.c | 19 ++++++++++++++----- net/openvswitch/flow_table.h | 3 ++- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9b14519e74d9..7cb76e5ca7cf 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -102,8 +102,8 @@ struct ovs_dp_megaflow_stats { __u64 n_mask_hit; /* Number of masks used for flow lookups. */ __u32 n_masks; /* Number of masks for the datapath. */ __u32 pad0; /* Pad for future expension. */ + __u64 n_cache_hit; /* Number of cache matches for flow lookups. */ __u64 pad1; /* Pad for future expension. */ - __u64 pad2; /* Pad for future expension. */ }; struct ovs_vport_stats { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6b6822f82f70..f45fee760504 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -225,13 +225,14 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) struct dp_stats_percpu *stats; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. 
*/ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; @@ -262,6 +263,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -699,6 +701,7 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 24fcec22fde2..38f7d3e66ca6 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -38,12 +38,15 @@ * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. 
*/ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; + u64 n_cache_hit; struct u64_stats_sync syncp; }; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index af22c9ee28dd..a5912ea05352 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -667,6 +667,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, + u32 *n_cache_hit, u32 *index) { u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); @@ -682,6 +683,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, u64_stats_update_begin(&ma->syncp); usage_counters[*index]++; u64_stats_update_end(&ma->syncp); + (*n_cache_hit)++; return flow; } } @@ -719,7 +721,8 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, - u32 *n_mask_hit) + u32 *n_mask_hit, + u32 *n_cache_hit) { struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); @@ -729,10 +732,13 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, int seg; *n_mask_hit = 0; + *n_cache_hit = 0; if (unlikely(!skb_hash)) { u32 mask_index = 0; + u32 cache = 0; - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash @@ -753,7 +759,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); + n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; @@ -766,10 +772,12 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, } /* Cache miss, do full lookup. 
*/ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); if (flow) ce->skb_hash = skb_hash; + *n_cache_hit = 0; return flow; } @@ -779,9 +787,10 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); + return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 1f664b050e3b..325e939371d8 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -82,7 +82,8 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, const struct sw_flow_key *, u32 skb_hash, - u32 *n_mask_hit); + u32 *n_mask_hit, + u32 *n_cache_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, -- cgit v1.2.3 From 9bf24f594c6acf676fb8c229f152c21bfb915ddb Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 31 Jul 2020 14:21:34 +0200 Subject: net: openvswitch: make masks cache size configurable This patch makes the masks cache size configurable; a size of 0 disables the cache. Reviewed-by: Paolo Abeni Reviewed-by: Tonghao Zhang Signed-off-by: Eelco Chaudron Signed-off-by: David S. 
Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 17 +++++++ net/openvswitch/flow_table.c | 101 ++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_table.h | 10 +++- 4 files changed, 115 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7cb76e5ca7cf..8300cc29dec8 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -86,6 +86,7 @@ enum ovs_datapath_attr { OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, + OVS_DP_ATTR_MASKS_CACHE_SIZE, __OVS_DP_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f45fee760504..42f8cc70bb2c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1498,6 +1498,7 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ return msgsize; } @@ -1535,6 +1536,10 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + genlmsg_end(skb, ovs_header); return 0; @@ -1599,6 +1604,16 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) #endif } + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + dp->user_features = user_features; if (dp->user_features & 
OVS_DP_F_TC_RECIRC_SHARING) @@ -1887,6 +1902,8 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), }; static const struct genl_ops dp_datapath_genl_ops[] = { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index a5912ea05352..6527d84c3ea6 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -38,8 +38,8 @@ #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) +#define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; @@ -341,15 +341,79 @@ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) } } +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. 
+ */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; struct mask_array *ma; - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, - __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); @@ -367,6 +431,7 @@ int ovs_flow_tbl_init(struct flow_table *table) rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; @@ -377,7 +442,7 @@ free_ti: free_mask_array: __mask_array_destroy(ma); free_mask_cache: - free_percpu(table->mask_cache); + __mask_cache_destroy(mc); return -ENOMEM; } @@ -453,9 +518,11 @@ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti 
= rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference(table->mask_cache); + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); - free_percpu(table->mask_cache); - call_rcu(&table->mask_array->rcu, mask_array_rcu_cb); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(table, ti, ufid_ti, false); } @@ -724,6 +791,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, u32 *n_mask_hit, u32 *n_cache_hit) { + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; @@ -733,7 +801,7 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, *n_mask_hit = 0; *n_cache_hit = 0; - if (unlikely(!skb_hash)) { + if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; u32 cache = 0; @@ -749,11 +817,11 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, ce = NULL; hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); + entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. 
*/ for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); + int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; @@ -867,6 +935,13 @@ int ovs_flow_tbl_num_masks(const struct flow_table *table) return READ_ONCE(ma->count); } +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { @@ -1095,8 +1170,8 @@ void ovs_flow_masks_rebalance(struct flow_table *table) for (i = 0; i < masks_entries; i++) { int index = masks_and_count[i].index; - new->masks[new->count++] = - rcu_dereference_ovsl(ma->masks[index]); + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; } rcu_assign_pointer(table->mask_array, new); diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 325e939371d8..74ce48fecba9 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -27,6 +27,12 @@ struct mask_cache_entry { u32 mask_index; }; +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. 
*/ + struct mask_cache_entry __percpu *mask_cache; +}; + struct mask_count { int index; u64 counter; @@ -53,7 +59,7 @@ struct table_instance { struct flow_table { struct table_instance __rcu *ti; struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; + struct mask_cache __rcu *mask_cache; struct mask_array __rcu *mask_array; unsigned long last_rehash; unsigned int count; @@ -77,6 +83,8 @@ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask); void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, u32 *bucket, u32 *idx); struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, -- cgit v1.2.3 From 88fab21c691bb1ff164e540735237a385e3afeaf Mon Sep 17 00:00:00 2001 From: Ioana-Ruxandra Stăncioi Date: Mon, 3 Aug 2020 07:33:33 +0000 Subject: seg6_iptunnel: Refactor seg6_lwt_headroom out of uapi header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the function seg6_lwt_headroom out of the seg6_iptunnel.h uapi header, because it is only used in seg6_iptunnel.c. Moreover, it is only used in the kernel code, as indicated by the "#ifdef __KERNEL__". Suggested-by: David Miller Signed-off-by: Ioana-Ruxandra Stăncioi Signed-off-by: David S. 
Miller --- include/uapi/linux/seg6_iptunnel.h | 21 --------------------- net/ipv6/seg6_iptunnel.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index 09fb608a35ec..eb815e0d0ac3 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -37,25 +37,4 @@ enum { SEG6_IPTUN_MODE_L2ENCAP, }; -#ifdef __KERNEL__ - -static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) -{ - int head = 0; - - switch (tuninfo->mode) { - case SEG6_IPTUN_MODE_INLINE: - break; - case SEG6_IPTUN_MODE_ENCAP: - head = sizeof(struct ipv6hdr); - break; - case SEG6_IPTUN_MODE_L2ENCAP: - return 0; - } - - return ((tuninfo->srh->hdrlen + 1) << 3) + head; -} - -#endif - #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index e0e9f48ab14f..897fa59c47de 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -27,6 +27,23 @@ #include #endif +static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) +{ + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; +} + struct seg6_lwt { struct dst_cache cache; struct seg6_iptunnel_encap tuninfo[]; -- cgit v1.2.3