7 files changed, 412 insertions, 36 deletions
diff --git a/Documentation/networking/ice.txt b/Documentation/networking/ice.txt
new file mode 100644
index 000000000000..6261c46378e1
--- /dev/null
+++ b/Documentation/networking/ice.txt
@@ -0,0 +1,39 @@
+Intel(R) Ethernet Connection E800 Series Linux Driver
+===================================================================
+
+Intel ice Linux driver.
+Copyright(c) 2018 Intel Corporation.
+
+Contents
+========
+- Enabling the driver
+- Support
+
+The driver in this release supports Intel's E800 Series of products. For
+more information, visit Intel's support page at http://support.intel.com.
+
+Enabling the driver
+===================
+
+The driver is enabled via the standard kernel configuration system,
+using the make command:
+
+     Make oldconfig/silentoldconfig/menuconfig/etc.
+
+The driver is located in the menu structure at:
+
+	-> Device Drivers
+	  -> Network device support (NETDEVICES [=y])
+	    -> Ethernet driver support
+	      -> Intel devices
+	        -> Intel(R) Ethernet Connection E800 Series Support
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+    http://support.intel.com
+
+If an issue is identified with the released source code, please email
+the maintainer listed in the MAINTAINERS file.
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a553d4e4a0fb..5dc1a040a2f1 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -133,14 +133,11 @@ min_adv_mss - INTEGER
 
 IP Fragmentation:
 
-ipfrag_high_thresh - INTEGER
-	Maximum memory used to reassemble IP fragments. When
-	ipfrag_high_thresh bytes of memory is allocated for this purpose,
-	the fragment handler will toss packets until ipfrag_low_thresh
-	is reached. This also serves as a maximum limit to namespaces
-	different from the initial one.
-
-ipfrag_low_thresh - INTEGER
+ipfrag_high_thresh - LONG INTEGER
+	Maximum memory used to reassemble IP fragments.
+
+ipfrag_low_thresh - LONG INTEGER
+	(Obsolete since linux-4.17)
 	Maximum memory used to reassemble IP fragments before the kernel
 	begins to remove incomplete fragment queues to free up resources.
 	The kernel still accepts new fragments for defragmentation.
@@ -755,13 +752,13 @@ udp_rmem_min - INTEGER
 	Minimal size of receive buffer used by UDP sockets in moderation.
 	Each UDP socket is able to use the size for receiving data, even if
 	total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-	Default: 1 page
+	Default: 4K
 
 udp_wmem_min - INTEGER
 	Minimal size of send buffer used by UDP sockets in moderation.
 	Each UDP socket is able to use the size for sending data, even if
 	total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-	Default: 1 page
+	Default: 4K
 
 CIPSOv4 Variables:
 
@@ -1363,6 +1360,13 @@ flowlabel_reflect - BOOLEAN
 	FALSE: disabled
 	Default: FALSE
 
+fib_multipath_hash_policy - INTEGER
+	Controls which hash policy to use for multipath routes.
+	Default: 0 (Layer 3)
+	Possible values:
+	0 - Layer 3 (source and destination addresses plus flow label)
+	1 - Layer 4 (standard 5-tuple)
+
 anycast_src_echo_reply - BOOLEAN
 	Controls the use of anycast addresses as source addresses for ICMPv6
 	echo reply
@@ -1696,7 +1700,9 @@ disable_ipv6 - BOOLEAN
 	interface and start Duplicate Address Detection, if necessary.
 
 	When this value is changed from 0 to 1 (IPv6 is being disabled),
-	it will dynamically delete all address on the given interface.
+	it will dynamically delete all addresses and routes on the given
+	interface. From now on it will not possible to add addresses/routes
+	to the selected interface.
 
 accept_dad - INTEGER
 	Whether to accept DAD (Duplicate Address Detection).
@@ -2094,7 +2100,7 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
 	It is guaranteed to each SCTP socket (but not association) even
 	under moderate memory pressure.
 
-	Default: 1 page
+	Default: 4K
 
 sctp_wmem  - vector of 3 INTEGERs: min, default, max
 	Currently this tunable has no effect.
diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst
index 291a01264967..fe46d4867e2d 100644
--- a/Documentation/networking/msg_zerocopy.rst
+++ b/Documentation/networking/msg_zerocopy.rst
@@ -72,11 +72,6 @@ this flag, a process must first signal intent by setting a socket option:
 	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
 		error(1, errno, "setsockopt zerocopy");
 
-Setting the socket option only works when the socket is in its initial
-(TCP_CLOSED) state.  Trying to set the option for a socket returned by accept(),
-for example, will lead to an EBUSY error. In this case, the option should be set
-to the listening socket and it will be inherited by the accepted sockets.
-
 Transmission
 ------------
 
diff --git a/Documentation/networking/net_dim.txt b/Documentation/networking/net_dim.txt
new file mode 100644
index 000000000000..9cb31c5e2dcd
--- /dev/null
+++ b/Documentation/networking/net_dim.txt
@@ -0,0 +1,174 @@
+Net DIM - Generic Network Dynamic Interrupt Moderation
+======================================================
+
+Author:
+	Tal Gilboa <talgi@mellanox.com>
+
+
+Contents
+=========
+
+- Assumptions
+- Introduction
+- The Net DIM Algorithm
+- Registering a Network Device to DIM
+- Example
+
+Part 0: Assumptions
+======================
+
+This document assumes the reader has basic knowledge in network drivers
+and in general interrupt moderation.
+
+
+Part I: Introduction
+======================
+
+Dynamic Interrupt Moderation (DIM) (in networking) refers to changing the
+interrupt moderation configuration of a channel in order to optimize packet
+processing. The mechanism includes an algorithm which decides if and how to
+change moderation parameters for a channel, usually by performing an analysis on
+runtime data sampled from the system. Net DIM is such a mechanism. In each
+iteration of the algorithm, it analyses a given sample of the data, compares it
+to the previous sample and if required, it can decide to change some of the
+interrupt moderation configuration fields. The data sample is composed of data
+bandwidth, the number of packets and the number of events. The time between
+samples is also measured. Net DIM compares the current and the previous data and
+returns an adjusted interrupt moderation configuration object. In some cases,
+the algorithm might decide not to change anything. The configuration fields are
+the minimum duration (microseconds) allowed between events and the maximum
+number of wanted packets per event. The Net DIM algorithm ascribes importance to
+increase bandwidth over reducing interrupt rate.
+
+
+Part II: The Net DIM Algorithm
+===============================
+
+Each iteration of the Net DIM algorithm follows these steps:
+1. Calculates new data sample.
+2. Compares it to previous sample.
+3. Makes a decision - suggests interrupt moderation configuration fields.
+4. Applies a schedule work function, which applies suggested configuration.
+
+The first two steps are straightforward, both the new and the previous data are
+supplied by the driver registered to Net DIM. The previous data is the new data
+supplied to the previous iteration. The comparison step checks the difference
+between the new and previous data and decides on the result of the last step.
+A step would result as "better" if bandwidth increases and as "worse" if
+bandwidth reduces. If there is no change in bandwidth, the packet rate is
+compared in a similar fashion - increase == "better" and decrease == "worse".
+In case there is no change in the packet rate as well, the interrupt rate is
+compared. Here the algorithm tries to optimize for lower interrupt rate so an
+increase in the interrupt rate is considered "worse" and a decrease is
+considered "better". Step #2 has an optimization for avoiding false results: it
+only considers a difference between samples as valid if it is greater than a
+certain percentage. Also, since Net DIM does not measure anything by itself, it
+assumes the data provided by the driver is valid.
+
+Step #3 decides on the suggested configuration based on the result from step #2
+and the internal state of the algorithm. The states reflect the "direction" of
+the algorithm: is it going left (reducing moderation), right (increasing
+moderation) or standing still. Another optimization is that if a decision
+to stay still is made multiple times, the interval between iterations of the
+algorithm would increase in order to reduce calculation overhead. Also, after
+"parking" on one of the most left or most right decisions, the algorithm may
+decide to verify this decision by taking a step in the other direction. This is
+done in order to avoid getting stuck in a "deep sleep" scenario. Once a
+decision is made, an interrupt moderation configuration is selected from
+the predefined profiles.
+
+The last step is to notify the registered driver that it should apply the
+suggested configuration. This is done by scheduling a work function, defined by
+the Net DIM API and provided by the registered driver.
+
+As you can see, Net DIM itself does not actively interact with the system. It
+would have trouble making the correct decisions if the wrong data is supplied to
+it and it would be useless if the work function would not apply the suggested
+configuration. This does, however, allow the registered driver some room for
+manoeuvre as it may provide partial data or ignore the algorithm suggestion
+under some conditions.
+
+
+Part III: Registering a Network Device to DIM
+==============================================
+
+Net DIM API exposes the main function net_dim(struct net_dim *dim,
+struct net_dim_sample end_sample). This function is the entry point to the Net
+DIM algorithm and has to be called every time the driver would like to check if
+it should change interrupt moderation parameters. The driver should provide two
+data structures: struct net_dim and struct net_dim_sample. Struct net_dim
+describes the state of DIM for a specific object (RX queue, TX queue,
+other queues, etc.). This includes the current selected profile, previous data
+samples, the callback function provided by the driver and more.
+Struct net_dim_sample describes a data sample, which will be compared to the
+data sample stored in struct net_dim in order to decide on the algorithm's next
+step. The sample should include bytes, packets and interrupts, measured by
+the driver.
+
+In order to use Net DIM from a networking driver, the driver needs to call the
+main net_dim() function. The recommended method is to call net_dim() on each
+interrupt. Since Net DIM has a built-in moderation and it might decide to skip
+iterations under certain conditions, there is no need to moderate the net_dim()
+calls as well. As mentioned above, the driver needs to provide an object of type
+struct net_dim to the net_dim() function call. It is advised for each entity
+using Net DIM to hold a struct net_dim as part of its data structure and use it
+as the main Net DIM API object. The struct net_dim_sample should hold the latest
+bytes, packets and interrupts count. No need to perform any calculations, just
+include the raw data.
+
+The net_dim() call itself does not return anything. Instead Net DIM relies on
+the driver to provide a callback function, which is called when the algorithm
+decides to make a change in the interrupt moderation parameters. This callback
+will be scheduled and run in a separate thread in order not to add overhead to
+the data flow. After the work is done, Net DIM algorithm needs to be set to
+the proper state in order to move to the next iteration.
+
+
+Part IV: Example
+=================
+
+The following code demonstrates how to register a driver to Net DIM. The actual
+usage is not complete but it should make the outline of the usage clear.
+
+my_driver.c:
+
+#include <linux/net_dim.h>
+
+/* Callback for net DIM to schedule on a decision to change moderation */
+void my_driver_do_dim_work(struct work_struct *work)
+{
+	/* Get struct net_dim from struct work_struct */
+	struct net_dim *dim = container_of(work, struct net_dim,
+					   work);
+	/* Do interrupt moderation related stuff */
+	...
+
+	/* Signal net DIM work is done and it should move to next iteration */
+	dim->state = NET_DIM_START_MEASURE;
+}
+
+/* My driver's interrupt handler */
+int my_driver_handle_interrupt(struct my_driver_entity *my_entity, ...)
+{
+	...
+	/* A struct to hold current measured data */
+	struct net_dim_sample dim_sample;
+	...
+	/* Initiate data sample struct with current data */
+	net_dim_sample(my_entity->events,
+		       my_entity->packets,
+		       my_entity->bytes,
+		       &dim_sample);
+	/* Call net DIM */
+	net_dim(&my_entity->dim, dim_sample);
+	...
+}
+
+/* My entity's initialization function (my_entity was already allocated) */
+int my_driver_init_my_entity(struct my_driver_entity *my_entity, ...)
+{
+	...
+	/* Initiate struct work_struct with my driver's callback function */
+	INIT_WORK(&my_entity->dim.work, my_driver_do_dim_work);
+	...
+}
diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt
new file mode 100644
index 000000000000..54128c50d508
--- /dev/null
+++ b/Documentation/networking/nf_flowtable.txt
@@ -0,0 +1,112 @@
+Netfilter's flowtable infrastructure
+====================================
+
+This documentation describes the software flowtable infrastructure available in
+Netfilter since Linux kernel 4.16.
+
+Overview
+--------
+
+Initial packets follow the classic forwarding path, once the flow enters the
+established state according to the conntrack semantics (ie. we have seen traffic
+in both directions), then you can decide to offload the flow to the flowtable
+from the forward chain via the 'flow offload' action available in nftables.
+
+Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the
+output netdevice via neigh_xmit(), hence, they bypass the classic forwarding
+path (the visible effect is that you do not see these packets from any of the
+netfilter hooks coming after the ingress). In case of flowtable miss, the packet
+follows the classic forward path.
+
+The flowtable uses a resizable hashtable, lookups are based on the following
+7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source
+and destination ports and the input interface (useful in case there are several
+conntrack zones in place).
+
+Flowtables are populated via the 'flow offload' nftables action, so the user can
+selectively specify what flows are placed into the flow table. Hence, packets
+follow the classic forwarding path unless the user explicitly instruct packets
+to use this new alternative forwarding path via nftables policy.
+
+This is represented in Fig.1, which describes the classic forwarding path
+including the Netfilter hooks and the flowtable fastpath bypass.
+
+                                         userspace process
+                                          ^              |
+                                          |              |
+                                     _____|____     ____\/___
+                                    /          \   /         \
+                                    |   input   |  |  output  |
+                                    \__________/   \_________/
+                                         ^               |
+                                         |               |
+      _________      __________      ---------     _____\/_____
+     /         \    /          \     |Routing |   /            \
+  -->  ingress  ---> prerouting ---> |decision|   | postrouting |--> neigh_xmit
+     \_________/    \__________/     ----------   \____________/          ^
+       |      ^          |               |               ^                |
+   flowtable  |          |          ____\/___            |                |
+       |      |          |         /         \           |                |
+    __\/___   |          --------->| forward |------------                |
+    |-----|   |                    \_________/                            |
+    |-----|   |                 'flow offload' rule                       |
+    |-----|   |                   adds entry to                           |
+    |_____|   |                     flowtable                             |
+       |      |                                                           |
+      / \     |                                                           |
+     /hit\_no_|                                                           |
+     \ ? /                                                                |
+      \ /                                                                 |
+       |__yes_________________fastpath bypass ____________________________|
+
+               Fig.1 Netfilter hooks and flowtable interactions
+
+The flowtable entry also stores the NAT configuration, so all packets are
+mangled according to the NAT policy that matches the initial packets that went
+through the classic forwarding path. The TTL is decremented before calling
+neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding
+path given that the transport selectors are missing, therefore flowtable lookup
+is not possible.
+
+Example configuration
+---------------------
+
+Enabling the flowtable bypass is relatively easy, you only need to create a
+flowtable and add one rule to your forward chain.
+
+        table inet x {
+		flowtable f {
+			hook ingress priority 0 devices = { eth0, eth1 };
+		}
+                chain y {
+                        type filter hook forward priority 0; policy accept;
+                        ip protocol tcp flow offload @f
+                        counter packets 0 bytes 0
+                }
+        }
+
+This example adds the flowtable 'f' to the ingress hook of the eth0 and eth1
+netdevices. You can create as many flowtables as you want in case you need to
+perform resource partitioning. The flowtable priority defines the order in which
+hooks are run in the pipeline, this is convenient in case you already have a
+nftables ingress chain (make sure the flowtable priority is smaller than the
+nftables ingress chain hence the flowtable runs before in the pipeline).
+
+The 'flow offload' action from the forward chain 'y' adds an entry to the
+flowtable for the TCP syn-ack packet coming in the reply direction. Once the
+flow is offloaded, you will observe that the counter rule in the example above
+does not get updated for the packets that are being forwarded through the
+forwarding bypass.
+
+More reading
+------------
+
+This documentation is based on the LWN.net articles [1][2]. Rafal Milecki also
+made a very complete and comprehensive summary called "A state of network
+acceleration" that describes how things were before this infrastructure was
+mailined [3] and it also makes a rough summary of this work [4].
+
+[1] https://lwn.net/Articles/738214/
+[2] https://lwn.net/Articles/742164/
+[3] http://lists.infradead.org/pipermail/lede-dev/2018-January/010830.html
+[4] http://lists.infradead.org/pipermail/lede-dev/2018-January/010829.html
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index bf654845556e..999eb41da81d 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -7,15 +7,12 @@ socket interface on 2.4/2.6/3.x kernels. This type of sockets is used for
 i) capture network traffic with utilities like tcpdump, ii) transmit network
 traffic, or any other that needs raw access to network interface.
 
-You can find the latest version of this document at:
-    http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap
-
 Howto can be found at:
-    http://wiki.gnu-log.net (packet_mmap)
+    https://sites.google.com/site/packetmmap/
 
 Please send your comments to
     Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
-    Johann Baudy <johann.baudy@gnu-log.net>
+    Johann Baudy
 
 -------------------------------------------------------------------------------
 + Why use PACKET_MMAP
@@ -51,17 +48,8 @@ From the user standpoint, you should use the higher level libpcap library, which
 is a de facto standard, portable across nearly all operating systems
 including Win32. 
 
-Said that, at time of this writing, official libpcap 0.8.1 is out and doesn't include
-support for PACKET_MMAP, and also probably the libpcap included in your distribution. 
-
-I'm aware of two implementations of PACKET_MMAP in libpcap:
-
-    http://wiki.ipxwarzone.com/		     (by Simon Patarin, based on libpcap 0.6.2)
-    http://public.lanl.gov/cpw/              (by Phil Wood, based on lastest libpcap)
-
-The rest of this document is intended for people who want to understand
-the low level details or want to improve libpcap by including PACKET_MMAP
-support.
+Packet MMAP support was integrated into libpcap around the time of version 1.3.0;
+TPACKET_V3 support was added in version 1.5.0
 
 --------------------------------------------------------------------------------
 + How to use mmap() directly to improve capture process
@@ -174,7 +162,7 @@ As capture, each frame contains two parts:
  /* bind socket to eth0 */
  bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
 
- A complete tutorial is available at: http://wiki.gnu-log.net/
+ A complete tutorial is available at: https://sites.google.com/site/packetmmap/
 
 By default, the user should put data at :
  frame base + TPACKET_HDRLEN - sizeof(struct sockaddr_ll)
diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt
index 77ed00631c12..58b5ef75f1b7 100644
--- a/Documentation/networking/tls.txt
+++ b/Documentation/networking/tls.txt
@@ -48,6 +48,9 @@ the transmit and the receive into the kernel.
 
   setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info));
 
+Transmit and receive are set separately, but the setup is the same, using either
+TLS_TX or TLS_RX.
+
 Sending TLS application data
 ----------------------------
 
@@ -79,6 +82,28 @@ for memory), or the encryption will always succeed.  If send() returns
 -ENOMEM and some data was left on the socket buffer from a previous
 call using MSG_MORE, the MSG_MORE data is left on the socket buffer.
 
+Receiving TLS application data
+------------------------------
+
+After setting the TLS_RX socket option, all recv family socket calls
+are decrypted using TLS parameters provided.  A full TLS record must
+be received before decryption can happen.
+
+  char buffer[16384];
+  recv(sock, buffer, 16384);
+
+Received data is decrypted directly in to the user buffer if it is
+large enough, and no additional allocations occur.  If the userspace
+buffer is too small, data is decrypted in the kernel and copied to
+userspace.
+
+EINVAL is returned if the TLS version in the received message does not
+match the version passed in setsockopt.
+
+EMSGSIZE is returned if the received message is too big.
+
+EBADMSG is returned if decryption failed for any other reason.
+
 Send TLS control messages
 -------------------------
 
@@ -118,6 +143,43 @@ using a record of type @record_type.
 Control message data should be provided unencrypted, and will be
 encrypted by the kernel.
 
+Receiving TLS control messages
+------------------------------
+
+TLS control messages are passed in the userspace buffer, with message
+type passed via cmsg.  If no cmsg buffer is provided, an error is
+returned if a control message is received.  Data messages may be
+received without a cmsg buffer set.
+
+  char buffer[16384];
+  char cmsg[CMSG_SPACE(sizeof(unsigned char))];
+  struct msghdr msg = {0};
+  msg.msg_control = cmsg;
+  msg.msg_controllen = sizeof(cmsg);
+
+  struct iovec msg_iov;
+  msg_iov.iov_base = buffer;
+  msg_iov.iov_len = 16384;
+
+  msg.msg_iov = &msg_iov;
+  msg.msg_iovlen = 1;
+
+  int ret = recvmsg(sock, &msg, 0 /* flags */);
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  if (cmsg->cmsg_level == SOL_TLS &&
+      cmsg->cmsg_type == TLS_GET_RECORD_TYPE) {
+      int record_type = *((unsigned char *)CMSG_DATA(cmsg));
+      // Do something with record_type, and control message data in
+      // buffer.
+      //
+      // Note that record_type may be == to application data (23).
+  } else {
+      // Buffer contains application data.
+  }
+
+recv will never return data from mixed types of TLS records.
+
 Integrating in to userspace TLS library
 ---------------------------------------
 
@@ -126,10 +188,10 @@ layer of a userspace TLS library.
 
 A patchset to OpenSSL to use ktls as the record layer is here:
 
-https://github.com/Mellanox/tls-openssl
+https://github.com/Mellanox/openssl/commits/tls_rx2
 
 An example of calling send directly after a handshake using
 gnutls.  Since it doesn't implement a full record layer, control
 messages are not supported:
 
-https://github.com/Mellanox/tls-af_ktls_tool
+https://github.com/ktls/af_ktls-tool/commits/RX