56 files changed, 3707 insertions(+), 176 deletions(-)
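The headline change in this series is native XDP support for the veth driver (new ndo_bpf and ndo_xdp_xmit callbacks), alongside the BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map type. As a rough, non-authoritative sketch of how the new veth XDP path would be exercised from userspace — assuming a 2018-era libbpf with the legacy bpf_prog_load() and bpf_set_link_xdp_fd() helpers, a prebuilt object file named xdp_prog.o, and an interface named veth0 (all illustrative, not part of the patch) — attachment might look like:

```c
/* Hedged sketch only: attach an XDP program to one end of a veth pair.
 * The object name, interface name and flag choice are assumptions, not
 * part of the patch below.
 */
#include <stdio.h>
#include <net/if.h>
#include <linux/if_link.h>	/* XDP_FLAGS_DRV_MODE */
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	int prog_fd, ifindex;

	ifindex = if_nametoindex("veth0");	/* hypothetical device name */
	if (!ifindex) {
		perror("if_nametoindex");
		return 1;
	}

	/* Legacy libbpf loader: parses the ELF and loads its XDP program. */
	if (bpf_prog_load("xdp_prog.o", BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
		fprintf(stderr, "failed to load xdp_prog.o\n");
		return 1;
	}

	/* DRV mode requests the native driver path that veth_xdp_set() below
	 * implements; the patch rejects the attach if the peer MTU is too
	 * large or the peer has more tx queues than this device has rx queues.
	 */
	if (bpf_set_link_xdp_fd(ifindex, prog_fd, XDP_FLAGS_DRV_MODE)) {
		fprintf(stderr, "failed to attach XDP program\n");
		return 1;
	}

	return 0;
}
```

Detaching would pass -1 as the program fd to bpf_set_link_xdp_fd() with the same flags.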
diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a69ad39ee57e..e3202af72df5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,22 +17,47 @@ #include <net/rtnetlink.h> #include <net/dst.h> #include <net/xfrm.h> +#include <net/xdp.h> #include <linux/veth.h> #include <linux/module.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/ptr_ring.h> +#include <linux/bpf_trace.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" +#define VETH_XDP_FLAG BIT(0) +#define VETH_RING_SIZE 256 +#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) + +/* Separating two types of XDP xmit */ +#define VETH_XDP_TX BIT(0) +#define VETH_XDP_REDIR BIT(1) + struct pcpu_vstats { u64 packets; u64 bytes; struct u64_stats_sync syncp; }; +struct veth_rq { + struct napi_struct xdp_napi; + struct net_device *dev; + struct bpf_prog __rcu *xdp_prog; + struct xdp_mem_info xdp_mem; + bool rx_notify_masked; + struct ptr_ring xdp_ring; + struct xdp_rxq_info xdp_rxq; +}; + struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; - unsigned requested_headroom; + struct bpf_prog *_xdp_prog; + struct veth_rq *rq; + unsigned int requested_headroom; }; /* @@ -98,11 +123,67 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_link_ksettings = veth_get_link_ksettings, }; +/* general routines */ + +static bool veth_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & VETH_XDP_FLAG; +} + +static void *veth_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); +} + +static void *veth_xdp_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); +} + +static void veth_ptr_free(void *ptr) +{ + if (veth_is_xdp_frame(ptr)) + xdp_return_frame(veth_ptr_to_xdp(ptr)); + else + kfree_skb(ptr); +} + +static void __veth_xdp_flush(struct veth_rq *rq) +{ + /* Write ptr_ring before reading rx_notify_masked */ + smp_mb(); + if (!rq->rx_notify_masked) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); + } +} + +static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) +{ + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { + dev_kfree_skb_any(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, + struct veth_rq *rq, bool xdp) +{ + return __dev_forward_skb(dev, skb) ?: xdp ? 
+ veth_xdp_rx(rq, skb) : + netif_rx(skb); +} + static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { - struct veth_priv *priv = netdev_priv(dev); + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; + bool rcv_xdp = false; + int rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -111,7 +192,16 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; } - if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { + rcv_priv = netdev_priv(rcv); + rxq = skb_get_queue_mapping(skb); + if (rxq < rcv->real_num_rx_queues) { + rq = &rcv_priv->rq[rxq]; + rcv_xdp = rcu_access_pointer(rq->xdp_prog); + if (rcv_xdp) + skb_record_rx_queue(skb, rxq); + } + + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); u64_stats_update_begin(&stats->syncp); @@ -122,14 +212,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) drop: atomic64_inc(&priv->dropped); } + + if (rcv_xdp) + __veth_xdp_flush(rq); + rcu_read_unlock(); + return NETDEV_TX_OK; } -/* - * general routines - */ - static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -179,18 +270,502 @@ static void veth_set_multicast_list(struct net_device *dev) { } +static struct sk_buff *veth_build_skb(void *head, int headroom, int len, + int buflen) +{ + struct sk_buff *skb; + + if (!buflen) { + buflen = SKB_DATA_ALIGN(headroom + len) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + } + skb = build_skb(head, buflen); + if (!skb) + return NULL; + + skb_reserve(skb, headroom); + skb_put(skb, len); + + return skb; +} + +static int veth_select_rxq(struct net_device *dev) +{ + return smp_processor_id() % dev->real_num_rx_queues; +} + +static int veth_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + unsigned int max_len; + struct veth_rq *rq; + int i, drops = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + return -ENXIO; + + rcv_priv = netdev_priv(rcv); + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive + * side. This means an XDP program is loaded on the peer and the peer + * device is up. + */ + if (!rcu_access_pointer(rq->xdp_prog)) + return -ENXIO; + + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; + + spin_lock(&rq->xdp_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *frame = frames[i]; + void *ptr = veth_xdp_to_ptr(frame); + + if (unlikely(frame->len > max_len || + __ptr_ring_produce(&rq->xdp_ring, ptr))) { + xdp_return_frame_rx_napi(frame); + drops++; + } + } + spin_unlock(&rq->xdp_ring.producer_lock); + + if (flags & XDP_XMIT_FLUSH) + __veth_xdp_flush(rq); + + return n - drops; +} + +static void veth_xdp_flush(struct net_device *dev) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + struct veth_rq *rq; + + rcu_read_lock(); + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + goto out; + + rcv_priv = netdev_priv(rcv); + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; + /* xdp_ring is initialized on receive side? 
*/ + if (unlikely(!rcu_access_pointer(rq->xdp_prog))) + goto out; + + __veth_xdp_flush(rq); +out: + rcu_read_unlock(); +} + +static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return veth_xdp_xmit(dev, 1, &frame, 0); +} + +static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, + struct xdp_frame *frame, + unsigned int *xdp_xmit) +{ + void *hard_start = frame->data - frame->headroom; + void *head = hard_start - sizeof(struct xdp_frame); + int len = frame->len, delta = 0; + struct xdp_frame orig_frame; + struct bpf_prog *xdp_prog; + unsigned int headroom; + struct sk_buff *skb; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (likely(xdp_prog)) { + struct xdp_buff xdp; + u32 act; + + xdp.data_hard_start = hard_start; + xdp.data = frame->data; + xdp.data_end = frame->data + frame->len; + xdp.data_meta = frame->data - frame->metasize; + xdp.rxq = &rq->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + delta = frame->data - xdp.data; + len = xdp.data_end - xdp.data; + break; + case XDP_TX: + orig_frame = *frame; + xdp.data_hard_start = head; + xdp.rxq->mem = frame->mem; + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { + trace_xdp_exception(rq->dev, xdp_prog, act); + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + orig_frame = *frame; + xdp.data_hard_start = head; + xdp.rxq->mem = frame->mem; + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rq->dev, xdp_prog, act); + case XDP_DROP: + goto err_xdp; + } + } + rcu_read_unlock(); + + headroom = sizeof(struct xdp_frame) + frame->headroom - delta; + skb = veth_build_skb(head, headroom, len, 0); + if (!skb) { + xdp_return_frame(frame); + goto err; + } + + xdp_scrub_frame(frame); + skb->protocol = eth_type_trans(skb, rq->dev); +err: + return skb; +err_xdp: + rcu_read_unlock(); + xdp_return_frame(frame); +xdp_xmit: + return NULL; +} + +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, + unsigned int *xdp_xmit) +{ + u32 pktlen, headroom, act, metalen; + void *orig_data, *orig_data_end; + struct bpf_prog *xdp_prog; + int mac_len, delta, off; + struct xdp_buff xdp; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (unlikely(!xdp_prog)) { + rcu_read_unlock(); + goto out; + } + + mac_len = skb->data - skb_mac_header(skb); + pktlen = skb->len + mac_len; + headroom = skb_headroom(skb) - mac_len; + + if (skb_shared(skb) || skb_head_is_locked(skb) || + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { + struct sk_buff *nskb; + int size, head_off; + void *head, *start; + struct page *page; + + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (size > PAGE_SIZE) + goto drop; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (!page) + goto drop; + + head = page_address(page); + start = head + VETH_XDP_HEADROOM; + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { + page_frag_free(head); + goto drop; + } + + nskb = veth_build_skb(head, + VETH_XDP_HEADROOM + mac_len, skb->len, + PAGE_SIZE); + if (!nskb) { + page_frag_free(head); + goto drop; + } + + skb_copy_header(nskb, skb); + 
head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + consume_skb(skb); + skb = nskb; + } + + xdp.data_hard_start = skb->head; + xdp.data = skb_mac_header(skb); + xdp.data_end = xdp.data + pktlen; + xdp.data_meta = xdp.data; + xdp.rxq = &rq->xdp_rxq; + orig_data = xdp.data; + orig_data_end = xdp.data_end; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + get_page(virt_to_page(xdp.data)); + consume_skb(skb); + xdp.rxq->mem = rq->xdp_mem; + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { + trace_xdp_exception(rq->dev, xdp_prog, act); + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + get_page(virt_to_page(xdp.data)); + consume_skb(skb); + xdp.rxq->mem = rq->xdp_mem; + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) + goto err_xdp; + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rq->dev, xdp_prog, act); + case XDP_DROP: + goto drop; + } + rcu_read_unlock(); + + delta = orig_data - xdp.data; + off = mac_len + delta; + if (off > 0) + __skb_push(skb, off); + else if (off < 0) + __skb_pull(skb, -off); + skb->mac_header -= delta; + off = xdp.data_end - orig_data_end; + if (off != 0) + __skb_put(skb, off); + skb->protocol = eth_type_trans(skb, rq->dev); + + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); +out: + return skb; +drop: + rcu_read_unlock(); + kfree_skb(skb); + return NULL; +err_xdp: + rcu_read_unlock(); + page_frag_free(xdp.data); +xdp_xmit: + return NULL; +} + +static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) +{ + int i, done = 0; + + for (i = 0; i < budget; i++) { + void *ptr = __ptr_ring_consume(&rq->xdp_ring); + struct sk_buff *skb; + + if (!ptr) + break; + + if (veth_is_xdp_frame(ptr)) { + skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), + xdp_xmit); + } else { + skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); + } + + if (skb) + napi_gro_receive(&rq->xdp_napi, skb); + + done++; + } + + return done; +} + +static int veth_poll(struct napi_struct *napi, int budget) +{ + struct veth_rq *rq = + container_of(napi, struct veth_rq, xdp_napi); + unsigned int xdp_xmit = 0; + int done; + + xdp_set_return_frame_no_direct(); + done = veth_xdp_rcv(rq, budget, &xdp_xmit); + + if (done < budget && napi_complete_done(napi, done)) { + /* Write rx_notify_masked before reading ptr_ring */ + smp_store_mb(rq->rx_notify_masked, false); + if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); + } + } + + if (xdp_xmit & VETH_XDP_TX) + veth_xdp_flush(rq->dev); + if (xdp_xmit & VETH_XDP_REDIR) + xdp_do_flush_map(); + xdp_clear_return_frame_no_direct(); + + return done; +} + +static int veth_napi_add(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); + if (err) + goto err_xdp_ring; + } + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + napi_enable(&rq->xdp_napi); + } + + return 0; +err_xdp_ring: + for (i--; i >= 0; i--) + ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); + + return 
err; +} + +static void veth_napi_del(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + napi_disable(&rq->xdp_napi); + napi_hash_del(&rq->xdp_napi); + } + synchronize_net(); + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_del(&rq->xdp_napi); + rq->rx_notify_masked = false; + ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); + } +} + +static int veth_enable_xdp(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_reg; + + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err < 0) + goto err_reg_mem; + + /* Save original mem info as it can be overwritten */ + rq->xdp_mem = rq->xdp_rxq.mem; + } + + err = veth_napi_add(dev); + if (err) + goto err_rxq_reg; + } + + for (i = 0; i < dev->real_num_rx_queues; i++) + rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); + + return 0; +err_reg_mem: + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); +err_rxq_reg: + for (i--; i >= 0; i--) + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + + return err; +} + +static void veth_disable_xdp(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) + rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); + veth_napi_del(dev); + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + rq->xdp_rxq.mem = rq->xdp_mem; + xdp_rxq_info_unreg(&rq->xdp_rxq); + } +} + static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); + int err; if (!peer) return -ENOTCONN; + if (priv->_xdp_prog) { + err = veth_enable_xdp(dev); + if (err) + return err; + } + if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } + return 0; } @@ -203,6 +778,9 @@ static int veth_close(struct net_device *dev) if (peer) netif_carrier_off(peer); + if (priv->_xdp_prog) + veth_disable_xdp(dev); + return 0; } @@ -228,7 +806,7 @@ static void veth_dev_free(struct net_device *dev) static void veth_poll_controller(struct net_device *dev) { /* veth only receives frames when its peer sends one - * Since it's a synchronous operation, we are guaranteed + * Since it has nothing to do with disabling irqs, we are guaranteed * never to have pending data when we poll for it so * there is nothing to do here. 
* @@ -253,6 +831,23 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static netdev_features_t veth_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer) { + struct veth_priv *peer_priv = netdev_priv(peer); + + if (peer_priv->_xdp_prog) + features &= ~NETIF_F_GSO_SOFTWARE; + } + + return features; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -276,6 +871,103 @@ out: rcu_read_unlock(); } +static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + struct bpf_prog *old_prog; + struct net_device *peer; + unsigned int max_mtu; + int err; + + old_prog = priv->_xdp_prog; + priv->_xdp_prog = prog; + peer = rtnl_dereference(priv->peer); + + if (prog) { + if (!peer) { + NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); + err = -ENOTCONN; + goto err; + } + + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - + peer->hard_header_len - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (peer->mtu > max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); + err = -ERANGE; + goto err; + } + + if (dev->real_num_rx_queues < peer->real_num_tx_queues) { + NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); + err = -ENOSPC; + goto err; + } + + if (dev->flags & IFF_UP) { + err = veth_enable_xdp(dev); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); + goto err; + } + } + + if (!old_prog) { + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; + peer->max_mtu = max_mtu; + } + } + + if (old_prog) { + if (!prog) { + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); + + if (peer) { + peer->hw_features |= NETIF_F_GSO_SOFTWARE; + peer->max_mtu = ETH_MAX_MTU; + } + } + bpf_prog_put(old_prog); + } + + if ((!!old_prog ^ !!prog) && peer) + netdev_update_features(peer); + + return 0; +err: + priv->_xdp_prog = old_prog; + + return err; +} + +static u32 veth_xdp_query(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + const struct bpf_prog *xdp_prog; + + xdp_prog = priv->_xdp_prog; + if (xdp_prog) + return xdp_prog->aux->id; + + return 0; +} + +static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return veth_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_QUERY_PROG: + xdp->prog_id = veth_xdp_query(dev); + return 0; + default: + return -EINVAL; + } +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -288,8 +980,11 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_poll_controller = veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, + .ndo_fix_features = veth_fix_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, + .ndo_bpf = veth_xdp, + .ndo_xdp_xmit = veth_xdp_xmit, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ @@ -345,13 +1040,31 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static int veth_alloc_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); + if (!priv->rq) + return -ENOMEM; + + return 0; +} 
+ +static void veth_free_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + kfree(priv->rq); +} + static struct rtnl_link_ops veth_link_ops; static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - int err; + int err, i; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; @@ -404,6 +1117,12 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, return PTR_ERR(peer); } + err = veth_alloc_queues(peer); + if (err) { + put_net(net); + goto err_peer_alloc_queues; + } + if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -432,6 +1151,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, * should be re-allocated */ + err = veth_alloc_queues(dev); + if (err) + goto err_alloc_queues; + if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); @@ -451,19 +1174,28 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, */ priv = netdev_priv(dev); + for (i = 0; i < dev->real_num_rx_queues; i++) + priv->rq[i].dev = dev; rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); + for (i = 0; i < peer->real_num_rx_queues; i++) + priv->rq[i].dev = peer; rcu_assign_pointer(priv->peer, dev); + return 0; err_register_dev: + veth_free_queues(dev); +err_alloc_queues: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: + veth_free_queues(peer); +err_peer_alloc_queues: free_netdev(peer); return err; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..523481a3471b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -48,8 +48,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ @@ -524,6 +529,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +775,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void 
bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) @@ -60,4 +63,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -554,6 +554,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, } /** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + +/** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -537,6 +538,20 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; + u32 kern_flags; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). 
Subsystems allowing direct data access must (!) @@ -738,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); @@ -765,6 +781,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { @@ -798,6 +835,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7e237a63a70c..17a13e4785fc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0054b3a9b923..8a5f70c7cdf2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -5,25 +5,36 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/types.h> +#include <linux/spinlock.h> #include <net/sock.h> +extern spinlock_t reuseport_lock; + struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + /* The last synq overflow event 
timestamp of this + * reuse->socks[] group. + */ + unsigned int synq_overflow_ts; + /* ID stays the same even after the size of socks[] grows. */ + unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); +int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d769dc20359b..d196901c9dba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -36,6 +36,7 @@ #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> +#include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> @@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + if (time_after32(now, last_overflow + HZ)) + WRITE_ONCE(reuse->synq_overflow_ts, now); + return; + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } @@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk) /* syncookies: no recent synqueue overflow on this listening socket? 
*/ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + return time_after32(now, last_overflow + + TCP_SYNCOOKIE_VALID); + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } diff --git a/include/net/xdp.h b/include/net/xdp.h index fcb033f51d8c..76b95256c266 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -84,6 +84,13 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +/* Clear kernel pointers in xdp_frame */ +static inline void xdp_scrub_frame(struct xdp_frame *frame) +{ + frame->data = NULL; + frame->dev_rx = NULL; +} + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..66917a4eba27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { @@ -150,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2091,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2113,6 +2133,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2196,7 +2224,9 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2413,6 +2443,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 46f5f29605d4..620bc5024d7d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 750d45edae79..ac1df79f3788 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -488,6 +488,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 513d9dfcf4ee..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details. 
*/ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> +#include <uapi/linux/btf.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -1162,6 +1164,27 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1171,6 +1194,7 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1182,6 +1206,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; /* Called from eBPF program */ @@ -1408,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 76efe9a183f5..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } @@ -332,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? 
+ &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <net/ipv6.h> +#include <uapi/linux/btf.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? + -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/sock_diag.h> +#include <net/sock_reuseport.h> + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. 
+ */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. 
+ */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. 
+ */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 70c0755e8fc4..0c1a696b041b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2498,6 +2498,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2508,6 +2509,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..43727ed0d94a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = 
map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; @@ -684,6 +712,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +820,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - diff --git a/net/core/filter.c b/net/core/filter.c index 587bbfbd7db3..15b9d2df92ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { + if (bpf_prog_size(prog->len) > 
sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) __bpf_prog_release(prog); - return err; - } - return 0; + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { @@ -2082,19 +2099,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2107,7 +2117,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3200,7 +3210,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3245,7 +3255,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3285,7 +3295,7 @@ err: int xdp_do_redirect(struct net_device 
*dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3317,7 +3327,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3368,7 +3378,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3399,7 +3409,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3423,7 +3433,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, unsigned long, map_owner) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3768,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *ancestor; + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return ancestor->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4956,6 +4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); @@ -7020,3 +7058,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, 
struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. + */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || 
type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. 
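+ * The BPF_W load above fetches the flags word at __sk_flags_offset; + * the BPF_AND/BPF_RSH pair then isolates the one-byte sk_protocol + * field, and *target_size is set to 1 below so the verifier knows + * the effective field size.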
+ */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8d574a88125d..c996c09d095f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void skb_headers_offset_update(struct sk_buff *skb, int off) +void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_network_header += off; skb->inner_mac_header += off; } +EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 064acb04be0f..ba5cba56f574 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -8,11 +8,34 @@ #include <net/sock_reuseport.h> #include <linux/bpf.h> +#include <linux/idr.h> +#include <linux/filter.h> #include <linux/rcupdate.h> #define INIT_SOCKS 128 -static DEFINE_SPINLOCK(reuseport_lock); +DEFINE_SPINLOCK(reuseport_lock); + +#define REUSEPORT_MIN_ID 1 +static DEFINE_IDA(reuseport_ida); + +int reuseport_get_id(struct sock_reuseport *reuse) +{ + int id; + + if (reuse->reuseport_id) + return reuse->reuseport_id; + + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, + /* Called under reuseport_lock */ + GFP_ATOMIC); + if (id < 0) + return id; + + reuse->reuseport_id = id; + + return reuse->reuseport_id; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. 
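+ * (The bind/hash path derives its bind_inany value from + * inet_rcv_saddr_any().)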
+ */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->num_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, @@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); + if (reuse->reuseport_id) + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } @@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; @@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one of the sk in this reuseport group is added to + * a bpf map. Notify the bpf side. The bpf map logic will + * remove the sk if it is indeed added to a bpf map. 
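+ * reuse->reuseport_id is only assigned (via reuseport_get_id()) the + * first time a socket of this group is added to a + * BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, so the check below is a cheap + * filter for groups that never touched a bpf map.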
+ */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { @@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -252,12 +302,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/core/xdp.c b/net/core/xdp.c index c013b836006b..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: @@ -348,8 +350,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, rcu_read_lock(); /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - if (!WARN_ON_ONCE(!xa)) - xa->zc_alloc->free(xa->zc_alloc, handle); + xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return 
ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** @@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -512,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 595ad408dba0..3d7c7460a0c5 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each(sk, &ilb->head) { @@ -214,12 +215,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; 
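+ /* reuseport_select_sock() can now return ERR_PTR(-ECONNREFUSED) when + * an attached BPF_PROG_TYPE_SK_REUSEPORT program does not return + * SK_PASS; the IS_ERR() check above maps that to a NULL (lookup + * failed) result. + */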
} EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6b96956a8ed..83f4c77c79d8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -249,6 +251,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index 000000000000..38255812e376 --- /dev/null +++ b/samples/bpf/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPG 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= ((signed char)data[sizeof (__u16)]) << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += (signed char)*data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c index 0cc3d71057f0..a306d1c75622 100644 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -13,6 +13,7 @@ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" +#include "hash_func01.h" #define MAX_CPUS 64 /* WARNING - sync with _user.c */ @@ -461,6 +462,108 @@ int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) return bpf_redirect_map(&cpu_map, cpu_dest, 0); } +/* Hashing initval */ +#define INITVAL 15485863 + +static __always_inline +u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + u32 cpu_hash; + + if (iph + 1 > data_end) + return 0; + + cpu_hash = iph->saddr + iph->daddr; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); + + return cpu_hash; +} + +static __always_inline +u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + u32 cpu_hash; + + if (ip6h + 1 > data_end) + return 0; + + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); + + return cpu_hash; +} + +/* Load-Balance traffic based on hashing 
IP-addrs + L4-proto. The + * hashing scheme is symmetric, meaning swapping IP src/dest still hit + * same CPU. + */ +SEC("xdp_cpu_map5_lb_hash_ip_pairs") +int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 *cpu_max; + u32 cpu_hash; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); + if (!cpu_max) + return XDP_ABORTED; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Hash for IPv4 and IPv6 */ + switch (eth_proto) { + case ETH_P_IP: + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_IPV6: + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ + default: + cpu_hash = 0; + } + + /* Choose CPU based on hash */ + cpu_idx = cpu_hash % *cpu_max; + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 4b4d78fffe30..9a6c7e0a6dd1 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -22,7 +22,7 @@ static const char *__doc__ = #define MAX_CPUS 64 /* WARNING - sync with _kern.c */ /* How many xdp_progs are defined in _kern.c */ -#define MAX_PROG 5 +#define MAX_PROG 6 /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead * use bpf/libbpf.h), but cannot as (currently) needed for XDP @@ -567,7 +567,7 @@ int main(int argc, char **argv) int added_cpus = 0; int longindex = 0; int interval = 2; - int prog_num = 0; + int prog_num = 5; int add_cpu = -1; __u32 qsize; int opt; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dd5758dc35d3..66917a4eba27 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { @@ -150,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2091,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. 
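+ * + * For example, if the cgroup associated with *skb* is /a/b, an + * *ancestor_level* of 0 selects the root cgroup, 1 selects /a, and + * 2 selects /a/b itself.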
+ * + * The format of returned id and helper limitations are the same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based @@ -2113,6 +2133,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY + * *map*. It checks that the selected sk matches the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2196,7 +2224,9 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2413,6 +2443,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? 
*/ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9ddc89dae962..60aa4ca8b2c5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -92,6 +92,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) attr.btf_key_type_id = create_attr->btf_key_type_id; attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; + attr.inner_map_fd = create_attr->inner_map_fd; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 0639a30a457d..6f38164b2618 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -39,6 +39,7 @@ struct bpf_create_map_attr { __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 map_ifindex; + __u32 inner_map_fd; }; int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 40211b51427a..2abd0f112627 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1501,6 +1501,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_SK_REUSEPORT: return false; case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_KPROBE: diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 17a7a5818ee1..fff7fb1285fc 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ - test_socket_cookie test_cgroup_storage + test_socket_cookie test_cgroup_storage test_select_reuseport TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -34,7 +34,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ - get_cgroup_id_kern.o socket_cookie_prog.o + get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ + test_skb_cgroup_id_kern.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -45,10 +46,11 @@ TEST_PROGS := test_kmod.sh \ test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ - test_lirc_mode2.sh + test_lirc_mode2.sh \ + test_skb_cgroup_id.sh # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr +TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user include ../lib.mk @@ -59,6 +61,7 @@ $(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c +$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_socket_cookie: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 9ba1c72d7cf5..e4be7730222d 100644 
--- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -111,6 +111,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, int size, int flags) = (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = + (void *) BPF_FUNC_sk_select_reuseport; static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = (void *) BPF_FUNC_get_stack; static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, @@ -137,6 +139,10 @@ static unsigned long long (*bpf_get_current_cgroup_id)(void) = (void *) BPF_FUNC_get_current_cgroup_id; static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = (void *) BPF_FUNC_get_local_storage; +static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = + (void *) BPF_FUNC_skb_cgroup_id; +static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = + (void *) BPF_FUNC_skb_ancestor_cgroup_id; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions @@ -173,6 +179,8 @@ struct bpf_map_def { static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = (void *) BPF_FUNC_skb_load_bytes; +static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = + (void *) BPF_FUNC_skb_load_bytes_relative; static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = (void *) BPF_FUNC_skb_store_bytes; static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index d0811b3d6a6f..315a44fa32af 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -44,4 +44,8 @@ static inline unsigned int bpf_num_possible_cpus(void) name[bpf_num_possible_cpus()] #define bpf_percpu(name, cpu) name[(cpu)].v +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + #endif /* __BPF_UTIL__ */ diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 6b1b302310fe..5f377ec53f2f 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -18,10 +18,7 @@ #include "../../../include/linux/filter.h" #include "bpf_rlimit.h" - -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif +#include "bpf_util.h" #define MAX_INSNS 512 #define MAX_MATCHES 16 diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index ffdd27737c9e..6b5cfeb7a9cc 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -19,6 +19,7 @@ #include <bpf/btf.h> #include "bpf_rlimit.h" +#include "bpf_util.h" static uint32_t pass_cnt; static uint32_t error_cnt; @@ -93,10 +94,6 @@ static int __base_pr(const char *format, ...) 
#define MAX_NR_RAW_TYPES 1024 #define BTF_LOG_BUF_SIZE 65535 -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - static struct args { unsigned int raw_test_num; unsigned int file_test_num; @@ -131,6 +128,8 @@ struct btf_raw_test { __u32 max_entries; bool btf_load_err; bool map_create_err; + bool ordered_map; + bool lossless_map; int hdr_len_delta; int type_off_delta; int str_off_delta; @@ -2093,8 +2092,7 @@ struct pprint_mapv { } aenum; }; -static struct btf_raw_test pprint_test = { - .descr = "BTF pretty print test #1", +static struct btf_raw_test pprint_test_template = { .raw_types = { /* unsighed char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), @@ -2146,8 +2144,6 @@ static struct btf_raw_test pprint_test = { }, .str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum", .str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), - .map_type = BPF_MAP_TYPE_ARRAY, - .map_name = "pprint_test", .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ @@ -2155,6 +2151,40 @@ static struct btf_raw_test pprint_test = { .max_entries = 128 * 1024, }; +static struct btf_pprint_test_meta { + const char *descr; + enum bpf_map_type map_type; + const char *map_name; + bool ordered_map; + bool lossless_map; +} pprint_tests_meta[] = { +{ + .descr = "BTF pretty print array", + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "pprint_test_array", + .ordered_map = true, + .lossless_map = true, +}, + +{ + .descr = "BTF pretty print hash", + .map_type = BPF_MAP_TYPE_HASH, + .map_name = "pprint_test_hash", + .ordered_map = false, + .lossless_map = true, +}, + +{ + .descr = "BTF pretty print lru hash", + .map_type = BPF_MAP_TYPE_LRU_HASH, + .map_name = "pprint_test_lru_hash", + .ordered_map = false, + .lossless_map = false, +}, + +}; + + static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) { v->ui32 = i; @@ -2166,10 +2196,12 @@ static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) v->aenum = i & 0x03; } -static int test_pprint(void) +static int do_test_pprint(void) { - const struct btf_raw_test *test = &pprint_test; + const struct btf_raw_test *test = &pprint_test_template; struct bpf_create_map_attr create_attr = {}; + unsigned int key, nr_read_elems; + bool ordered_map, lossless_map; int map_fd = -1, btf_fd = -1; struct pprint_mapv mapv = {}; unsigned int raw_btf_size; @@ -2178,7 +2210,6 @@ static int test_pprint(void) char pin_path[255]; size_t line_len = 0; char *line = NULL; - unsigned int key; uint8_t *raw_btf; ssize_t nread; int err, ret; @@ -2251,14 +2282,18 @@ static int test_pprint(void) goto done; } - key = 0; + nr_read_elems = 0; + ordered_map = test->ordered_map; + lossless_map = test->lossless_map; do { ssize_t nexpected_line; + unsigned int next_key; - set_pprint_mapv(&mapv, key); + next_key = ordered_map ? 
nr_read_elems : atoi(line); + set_pprint_mapv(&mapv, next_key); nexpected_line = snprintf(expected_line, sizeof(expected_line), "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", - key, + next_key, mapv.ui32, mapv.si32, mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b, mapv.ui64, @@ -2281,11 +2316,12 @@ static int test_pprint(void) } nread = getline(&line, &line_len, pin_file); - } while (++key < test->max_entries && nread > 0); + } while (++nr_read_elems < test->max_entries && nread > 0); - if (CHECK(key < test->max_entries, - "Unexpected EOF. key:%u test->max_entries:%u", - key, test->max_entries)) { + if (lossless_map && + CHECK(nr_read_elems < test->max_entries, + "Unexpected EOF. nr_read_elems:%u test->max_entries:%u", + nr_read_elems, test->max_entries)) { err = -1; goto done; } @@ -2314,6 +2350,24 @@ done: return err; } +static int test_pprint(void) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { + pprint_test_template.descr = pprint_tests_meta[i].descr; + pprint_test_template.map_type = pprint_tests_meta[i].map_type; + pprint_test_template.map_name = pprint_tests_meta[i].map_name; + pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map; + pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map; + + err |= count_result(do_test_pprint()); + } + + return err; +} + static void usage(const char *cmd) { fprintf(stderr, "Usage: %s [-l] [[-r test_num (1 - %zu)] | [-g test_num (1 - %zu)] | [-f test_num (1 - %zu)] | [-p]]\n", @@ -2409,7 +2463,7 @@ int main(int argc, char **argv) err |= test_file(); if (args.pprint_test) - err |= count_result(test_pprint()); + err |= test_pprint(); if (args.raw_test || args.get_info_test || args.file_test || args.pprint_test) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 6c253343a6f9..4b7c74f5faa7 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -17,7 +17,8 @@ #include <stdlib.h> #include <sys/wait.h> - +#include <sys/socket.h> +#include <netinet/in.h> #include <linux/bpf.h> #include <bpf/bpf.h> @@ -26,8 +27,21 @@ #include "bpf_util.h" #include "bpf_rlimit.h" +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static int map_flags; +#define CHECK(condition, tag, format...) 
({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + exit(-1); \ + } \ +}) + static void test_hashmap(int task, void *data) { long long key, next_key, first_key, value; @@ -1150,6 +1164,250 @@ static void test_map_wronly(void) assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); } +static void prepare_reuseport_grp(int type, int map_fd, + __s64 *fds64, __u64 *sk_cookies, + unsigned int n) +{ + socklen_t optlen, addrlen; + struct sockaddr_in6 s6; + const __u32 index0 = 0; + const int optval = 1; + unsigned int i; + u64 sk_cookie; + __s64 fd64; + int err; + + s6.sin6_family = AF_INET6; + s6.sin6_addr = in6addr_any; + s6.sin6_port = 0; + addrlen = sizeof(s6); + optlen = sizeof(sk_cookie); + + for (i = 0; i < n; i++) { + fd64 = socket(AF_INET6, type, 0); + CHECK(fd64 == -1, "socket()", + "sock_type:%d fd64:%lld errno:%d\n", + type, fd64, errno); + + err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof(optval)); + CHECK(err == -1, "setsockopt(SO_REUSEEPORT)", + "err:%d errno:%d\n", err, errno); + + /* reuseport_array does not allow unbound sk */ + err = bpf_map_update_elem(map_fd, &index0, &fd64, + BPF_ANY); + CHECK(err != -1 || errno != EINVAL, + "reuseport array update unbound sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6)); + CHECK(err == -1, "bind()", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + if (i == 0) { + err = getsockname(fd64, (struct sockaddr *)&s6, + &addrlen); + CHECK(err == -1, "getsockname()", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + } + + err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie, + &optlen); + CHECK(err == -1, "getsockopt(SO_COOKIE)", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + if (type == SOCK_STREAM) { + /* + * reuseport_array does not allow + * non-listening tcp sk. + */ + err = bpf_map_update_elem(map_fd, &index0, &fd64, + BPF_ANY); + CHECK(err != -1 || errno != EINVAL, + "reuseport array update non-listening sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + err = listen(fd64, 0); + CHECK(err == -1, "listen()", + "sock_type:%d, err:%d errno:%d\n", + type, err, errno); + } + + fds64[i] = fd64; + sk_cookies[i] = sk_cookie; + } +} + +static void test_reuseport_array(void) +{ +#define REUSEPORT_FD_IDX(err, last) ({ (err) ? 
last : !last; }) + + const __u32 array_size = 4, index0 = 0, index3 = 3; + int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type; + __u64 grpa_cookies[2], sk_cookie, map_cookie; + __s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1; + const __u32 bad_index = array_size; + int map_fd, err, t, f; + __u32 fds_idx = 0; + int fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(__u32), sizeof(__u64), array_size, 0); + CHECK(map_fd == -1, "reuseport array create", + "map_fd:%d, errno:%d\n", map_fd, errno); + + /* Test lookup/update/delete with invalid index */ + err = bpf_map_delete_elem(map_fd, &bad_index); + CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries", + "err:%d errno:%d\n", err, errno); + + err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY); + CHECK(err != -1 || errno != E2BIG, + "reuseport array update >=max_entries", + "err:%d errno:%d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array update >=max_entries", + "err:%d errno:%d\n", err, errno); + + /* Test lookup/delete non existence elem */ + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array lookup not-exist elem", + "err:%d errno:%d\n", err, errno); + err = bpf_map_delete_elem(map_fd, &index3); + CHECK(err != -1 || errno != ENOENT, + "reuseport array del not-exist elem", + "err:%d errno:%d\n", err, errno); + + for (t = 0; t < ARRAY_SIZE(types); t++) { + type = types[t]; + + prepare_reuseport_grp(type, map_fd, grpa_fds64, + grpa_cookies, ARRAY_SIZE(grpa_fds64)); + + /* Test BPF_* update flags */ + /* BPF_EXIST failure case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_EXIST); + CHECK(err != -1 || errno != ENOENT, + "reuseport array update empty elem BPF_EXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_NOEXIST success case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_NOEXIST); + CHECK(err == -1, + "reuseport array update empty elem BPF_NOEXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_EXIST success case. 
*/ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_EXIST); + CHECK(err == -1, + "reuseport array update same elem BPF_EXIST", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_NOEXIST failure case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_NOEXIST); + CHECK(err != -1 || errno != EEXIST, + "reuseport array update non-empty elem BPF_NOEXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_ANY case (always succeed) */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_ANY); + CHECK(err == -1, + "reuseport array update same sk with BPF_ANY", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + fd64 = grpa_fds64[fds_idx]; + sk_cookie = grpa_cookies[fds_idx]; + + /* The same sk cannot be added to reuseport_array twice */ + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY); + CHECK(err != -1 || errno != EBUSY, + "reuseport array update same sk with same index", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY); + CHECK(err != -1 || errno != EBUSY, + "reuseport array update same sk with different index", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + /* Test delete elem */ + err = bpf_map_delete_elem(map_fd, &index3); + CHECK(err == -1, "reuseport array delete sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + /* Add it back with BPF_NOEXIST */ + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); + CHECK(err == -1, + "reuseport array re-add with BPF_NOEXIST after del", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + /* Test cookie */ + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err == -1 || sk_cookie != map_cookie, + "reuseport array lookup re-added sk", + "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn", + type, err, errno, sk_cookie, map_cookie); + + /* Test elem removed by close() */ + for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++) + close(grpa_fds64[f]); + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array lookup after close()", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + } + + /* Test SOCK_RAW */ + fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP); + CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n", + err, errno); + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); + CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", + "err:%d errno:%d\n", err, errno); + close(fd64); + + /* Close the 64 bit value map */ + close(map_fd); + + /* Test 32 bit fd */ + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(__u32), sizeof(__u32), array_size, 0); + CHECK(map_fd == -1, "reuseport array create", + "map_fd:%d, errno:%d\n", map_fd, errno); + prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1); + fd = fd64; + err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST); + CHECK(err == -1, "reuseport array update 32 bit fd", + "err:%d errno:%d\n", err, errno); + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOSPC, + "reuseport array lookup 32 bit fd", + "err:%d errno:%d\n", err, errno); + close(fd); + close(map_fd); +} + static void run_all_tests(void) { test_hashmap(0, NULL); @@ -1170,6 +1428,8 @@ static void run_all_tests(void) test_map_rdonly(); 
test_map_wronly(); + + test_reuseport_array(); } int main(void) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c new file mode 100644 index 000000000000..75646d9b34aa --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ + +#include <stdlib.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#include <assert.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/types.h> +#include <linux/if_ether.h> +#include <sys/types.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_rlimit.h" +#include "bpf_util.h" +#include "test_select_reuseport_common.h" + +#define MIN_TCPHDR_LEN 20 +#define UDPHDR_LEN 8 + +#define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies" +#define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen" +#define REUSEPORT_ARRAY_SIZE 32 + +static int result_map, tmp_index_ovr_map, linum_map, data_check_map; +static enum result expected_results[NR_RESULTS]; +static int sk_fds[REUSEPORT_ARRAY_SIZE]; +static int reuseport_array, outer_map; +static int select_by_skb_data_prog; +static int saved_tcp_syncookie; +static struct bpf_object *obj; +static int saved_tcp_fo; +static __u32 index_zero; +static int epfd; + +static union sa46 { + struct sockaddr_in6 v6; + struct sockaddr_in v4; + sa_family_t family; +} srv_sa; + +#define CHECK(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + exit(-1); \ + } \ +}) + +static void create_maps(void) +{ + struct bpf_create_map_attr attr = {}; + + /* Creating reuseport_array */ + attr.name = "reuseport_array"; + attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + attr.key_size = sizeof(__u32); + attr.value_size = sizeof(__u32); + attr.max_entries = REUSEPORT_ARRAY_SIZE; + + reuseport_array = bpf_create_map_xattr(&attr); + CHECK(reuseport_array == -1, "creating reuseport_array", + "reuseport_array:%d errno:%d\n", reuseport_array, errno); + + /* Creating outer_map */ + attr.name = "outer_map"; + attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS; + attr.key_size = sizeof(__u32); + attr.value_size = sizeof(__u32); + attr.max_entries = 1; + attr.inner_map_fd = reuseport_array; + outer_map = bpf_create_map_xattr(&attr); + CHECK(outer_map == -1, "creating outer_map", + "outer_map:%d errno:%d\n", outer_map, errno); +} + +static void prepare_bpf_obj(void) +{ + struct bpf_program *prog; + struct bpf_map *map; + int err; + struct bpf_object_open_attr attr = { + .file = "test_select_reuseport_kern.o", + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + }; + + obj = bpf_object__open_xattr(&attr); + CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); + + prog = bpf_program__next(NULL, obj); + CHECK(!prog, "get first bpf_program", "!prog\n"); + bpf_program__set_type(prog, attr.prog_type); + + map = bpf_object__find_map_by_name(obj, "outer_map"); + CHECK(!map, "find outer_map", "!map\n"); + err = bpf_map__reuse_fd(map, outer_map); + CHECK(err, "reuse outer_map", "err:%d\n", err); + + err = bpf_object__load(obj); + CHECK(err, "load bpf_object", "err:%d\n", err); + + select_by_skb_data_prog = bpf_program__fd(prog); + CHECK(select_by_skb_data_prog == -1, "get prog fd", + 
"select_by_skb_data_prog:%d\n", select_by_skb_data_prog); + + map = bpf_object__find_map_by_name(obj, "result_map"); + CHECK(!map, "find result_map", "!map\n"); + result_map = bpf_map__fd(map); + CHECK(result_map == -1, "get result_map fd", + "result_map:%d\n", result_map); + + map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); + CHECK(!map, "find tmp_index_ovr_map", "!map\n"); + tmp_index_ovr_map = bpf_map__fd(map); + CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); + + map = bpf_object__find_map_by_name(obj, "linum_map"); + CHECK(!map, "find linum_map", "!map\n"); + linum_map = bpf_map__fd(map); + CHECK(linum_map == -1, "get linum_map fd", + "linum_map:%d\n", linum_map); + + map = bpf_object__find_map_by_name(obj, "data_check_map"); + CHECK(!map, "find data_check_map", "!map\n"); + data_check_map = bpf_map__fd(map); + CHECK(data_check_map == -1, "get data_check_map fd", + "data_check_map:%d\n", data_check_map); +} + +static void sa46_init_loopback(union sa46 *sa, sa_family_t family) +{ + memset(sa, 0, sizeof(*sa)); + sa->family = family; + if (sa->family == AF_INET6) + sa->v6.sin6_addr = in6addr_loopback; + else + sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); +} + +static void sa46_init_inany(union sa46 *sa, sa_family_t family) +{ + memset(sa, 0, sizeof(*sa)); + sa->family = family; + if (sa->family == AF_INET6) + sa->v6.sin6_addr = in6addr_any; + else + sa->v4.sin_addr.s_addr = INADDR_ANY; +} + +static int read_int_sysctl(const char *sysctl) +{ + char buf[16]; + int fd, ret; + + fd = open(sysctl, 0); + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", + sysctl, fd, errno); + + ret = read(fd, buf, sizeof(buf)); + CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", + sysctl, ret, errno); + close(fd); + + return atoi(buf); +} + +static void write_int_sysctl(const char *sysctl, int v) +{ + int fd, ret, size; + char buf[16]; + + fd = open(sysctl, O_RDWR); + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", + sysctl, fd, errno); + + size = snprintf(buf, sizeof(buf), "%d", v); + ret = write(fd, buf, size); + CHECK(ret != size, "write(sysctl)", + "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); + close(fd); +} + +static void restore_sysctls(void) +{ + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); +} + +static void enable_fastopen(void) +{ + int fo; + + fo = read_int_sysctl(TCP_FO_SYSCTL); + write_int_sysctl(TCP_FO_SYSCTL, fo | 7); +} + +static void enable_syncookie(void) +{ + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); +} + +static void disable_syncookie(void) +{ + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); +} + +static __u32 get_linum(void) +{ + __u32 linum; + int err; + + err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); + CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", + err, errno); + + return linum; +} + +static void check_data(int type, sa_family_t family, const struct cmd *cmd, + int cli_fd) +{ + struct data_check expected = {}, result; + union sa46 cli_sa; + socklen_t addrlen; + int err; + + addrlen = sizeof(cli_sa); + err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, + &addrlen); + CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", + err, errno); + + err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); + CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", + err, errno); + + if (type == SOCK_STREAM) { + expected.len = MIN_TCPHDR_LEN; + 
expected.ip_protocol = IPPROTO_TCP; + } else { + expected.len = UDPHDR_LEN; + expected.ip_protocol = IPPROTO_UDP; + } + + if (family == AF_INET6) { + expected.eth_protocol = htons(ETH_P_IPV6); + expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && + !srv_sa.v6.sin6_addr.s6_addr32[2] && + !srv_sa.v6.sin6_addr.s6_addr32[1] && + !srv_sa.v6.sin6_addr.s6_addr32[0]; + + memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, + sizeof(cli_sa.v6.sin6_addr)); + memcpy(&expected.skb_addrs[4], &in6addr_loopback, + sizeof(in6addr_loopback)); + expected.skb_ports[0] = cli_sa.v6.sin6_port; + expected.skb_ports[1] = srv_sa.v6.sin6_port; + } else { + expected.eth_protocol = htons(ETH_P_IP); + expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; + + expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; + expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); + expected.skb_ports[0] = cli_sa.v4.sin_port; + expected.skb_ports[1] = srv_sa.v4.sin_port; + } + + if (memcmp(&result, &expected, offsetof(struct data_check, + equal_check_end))) { + printf("unexpected data_check\n"); + printf(" result: (0x%x, %u, %u)\n", + result.eth_protocol, result.ip_protocol, + result.bind_inany); + printf("expected: (0x%x, %u, %u)\n", + expected.eth_protocol, expected.ip_protocol, + expected.bind_inany); + CHECK(1, "data_check result != expected", + "bpf_prog_linum:%u\n", get_linum()); + } + + CHECK(!result.hash, "data_check result.hash empty", + "result.hash:%u\n", result.hash); + + expected.len += cmd ? sizeof(*cmd) : 0; + if (type == SOCK_STREAM) + CHECK(expected.len > result.len, "expected.len > result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", + expected.len, result.len, get_linum()); + else + CHECK(expected.len != result.len, "expected.len != result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", + expected.len, result.len, get_linum()); +} + +static void check_results(void) +{ + __u32 results[NR_RESULTS]; + __u32 i, broken = 0; + int err; + + for (i = 0; i < NR_RESULTS; i++) { + err = bpf_map_lookup_elem(result_map, &i, &results[i]); + CHECK(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); + } + + for (i = 0; i < NR_RESULTS; i++) { + if (results[i] != expected_results[i]) { + broken = i; + break; + } + } + + if (i == NR_RESULTS) + return; + + printf("unexpected result\n"); + printf(" result: ["); + printf("%u", results[0]); + for (i = 1; i < NR_RESULTS; i++) + printf(", %u", results[i]); + printf("]\n"); + + printf("expected: ["); + printf("%u", expected_results[0]); + for (i = 1; i < NR_RESULTS; i++) + printf(", %u", expected_results[i]); + printf("]\n"); + + CHECK(expected_results[broken] != results[broken], + "unexpected result", + "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", + broken, broken, get_linum()); +} + +static int send_data(int type, sa_family_t family, void *data, size_t len, + enum result expected) +{ + union sa46 cli_sa; + int fd, err; + + fd = socket(family, type, 0); + CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); + + sa46_init_loopback(&cli_sa, family); + err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); + CHECK(err == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); + + err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, + sizeof(srv_sa)); + CHECK(err != len && expected >= PASS, + "sendto()", "family:%u err:%d errno:%d expected:%d\n", + family, err, errno, expected); + + return fd; +} + +static void do_test(int type, sa_family_t family, struct cmd *cmd, + enum result expected) +{ +
int nev, srv_fd, cli_fd; + struct epoll_event ev; + struct cmd rcv_cmd; + ssize_t nread; + + cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, + expected); + nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); + CHECK((nev <= 0 && expected >= PASS) || + (nev > 0 && expected < PASS), + "nev <> expected", + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", + nev, expected, type, family, + cmd ? cmd->reuseport_index : -1, + cmd ? cmd->pass_on_failure : -1); + check_results(); + check_data(type, family, cmd, cli_fd); + + if (expected < PASS) + return; + + CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && + cmd->reuseport_index != ev.data.u32, + "check cmd->reuseport_index", + "cmd:(%u, %u) ev.data.u32:%u\n", + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); + + srv_fd = sk_fds[ev.data.u32]; + if (type == SOCK_STREAM) { + int new_fd = accept(srv_fd, NULL, 0); + + CHECK(new_fd == -1, "accept(srv_fd)", + "ev.data.u32:%u new_fd:%d errno:%d\n", + ev.data.u32, new_fd, errno); + + nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); + CHECK(nread != sizeof(rcv_cmd), + "recv(new_fd)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); + + close(new_fd); + } else { + nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); + CHECK(nread != sizeof(rcv_cmd), + "recv(sk_fds)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); + } + + close(cli_fd); +} + +static void test_err_inner_map(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = 0, + .pass_on_failure = 0, + }; + + printf("%s: ", __func__); + expected_results[DROP_ERR_INNER_MAP]++; + do_test(type, family, &cmd, DROP_ERR_INNER_MAP); + printf("OK\n"); +} + +static void test_err_skb_data(int type, sa_family_t family) +{ + printf("%s: ", __func__); + expected_results[DROP_ERR_SKB_DATA]++; + do_test(type, family, NULL, DROP_ERR_SKB_DATA); + printf("OK\n"); +} + +static void test_err_sk_select_port(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = REUSEPORT_ARRAY_SIZE, + .pass_on_failure = 0, + }; + + printf("%s: ", __func__); + expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; + do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); + printf("OK\n"); +} + +static void test_pass(int type, sa_family_t family) +{ + struct cmd cmd; + int i; + + printf("%s: ", __func__); + cmd.pass_on_failure = 0; + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { + expected_results[PASS]++; + cmd.reuseport_index = i; + do_test(type, family, &cmd, PASS); + } + printf("OK\n"); +} + +static void test_syncookie(int type, sa_family_t family) +{ + int err, tmp_index = 1; + struct cmd cmd = { + .reuseport_index = 0, + .pass_on_failure = 0, + }; + + if (type != SOCK_STREAM) + return; + + printf("%s: ", __func__); + /* + * +1 for TCP-SYN and + * +1 for the TCP-ACK (ack the syncookie) + */ + expected_results[PASS] += 2; + enable_syncookie(); + /* + * Simulate TCP-SYN and TCP-ACK are handled by two different sk: + * TCP-SYN: select sk_fds[tmp_index = 1] tmp_index is from the + * tmp_index_ovr_map + * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index + * is from the cmd.reuseport_index + */ + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, + &tmp_index, BPF_ANY); + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", + "err:%d errno:%d\n", err, errno); + do_test(type, family, &cmd, PASS); + err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, + 
&tmp_index); + CHECK(err == -1 || tmp_index != -1, + "lookup_elem(tmp_index_ovr_map)", + "err:%d errno:%d tmp_index:%d\n", + err, errno, tmp_index); + disable_syncookie(); + printf("OK\n"); +} + +static void test_pass_on_err(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = REUSEPORT_ARRAY_SIZE, + .pass_on_failure = 1, + }; + + printf("%s: ", __func__); + expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; + do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); + printf("OK\n"); +} + +static void prepare_sk_fds(int type, sa_family_t family, bool inany) +{ + const int first = REUSEPORT_ARRAY_SIZE - 1; + int i, err, optval = 1; + struct epoll_event ev; + socklen_t addrlen; + + if (inany) + sa46_init_inany(&srv_sa, family); + else + sa46_init_loopback(&srv_sa, family); + addrlen = sizeof(srv_sa); + + /* + * The sk_fds[] is filled from the back such that the order + * is exactly opposite to the (struct sock_reuseport *)reuse->socks[]. + */ + for (i = first; i >= 0; i--) { + sk_fds[i] = socket(family, type, 0); + CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", + i, sk_fds[i], errno); + err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof(optval)); + CHECK(err == -1, "setsockopt(SO_REUSEPORT)", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + + if (i == first) { + err = setsockopt(sk_fds[i], SOL_SOCKET, + SO_ATTACH_REUSEPORT_EBPF, + &select_by_skb_data_prog, + sizeof(select_by_skb_data_prog)); + CHECK(err == -1, "setsockopt(SO_ATTACH_REUSEPORT_EBPF)", + "err:%d errno:%d\n", err, errno); + } + + err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); + CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + + if (type == SOCK_STREAM) { + err = listen(sk_fds[i], 10); + CHECK(err == -1, "listen()", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + } + + err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], + BPF_NOEXIST); + CHECK(err == -1, "update_elem(reuseport_array)", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + + if (i == first) { + socklen_t addrlen = sizeof(srv_sa); + + err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, + &addrlen); + CHECK(err == -1, "getsockname()", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + } + } + + epfd = epoll_create(1); + CHECK(epfd == -1, "epoll_create(1)", + "epfd:%d errno:%d\n", epfd, errno); + + ev.events = EPOLLIN; + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { + ev.data.u32 = i; + err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); + } +} + +static void setup_per_test(int type, unsigned short family, bool inany) +{ + int ovr = -1, err; + + prepare_sk_fds(type, family, inany); + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, + BPF_ANY); + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", + "err:%d errno:%d\n", err, errno); +} + +static void cleanup_per_test(void) +{ + int i, err; + + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) + close(sk_fds[i]); + close(epfd); + + err = bpf_map_delete_elem(outer_map, &index_zero); + CHECK(err == -1, "delete_elem(outer_map)", + "err:%d errno:%d\n", err, errno); +} + +static void cleanup(void) +{ + close(outer_map); + close(reuseport_array); + bpf_object__close(obj); +} + +static void test_all(void) +{ + /* Extra SOCK_STREAM to test bind_inany==true */ + const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; + const char * const type_strings[] = { "TCP", "UDP", "TCP" }; + const char * const family_strings[] = {
"IPv6", "IPv4" }; + const unsigned short families[] = { AF_INET6, AF_INET }; + const bool bind_inany[] = { false, false, true }; + int t, f, err; + + for (f = 0; f < ARRAY_SIZE(families); f++) { + unsigned short family = families[f]; + + for (t = 0; t < ARRAY_SIZE(types); t++) { + bool inany = bind_inany[t]; + int type = types[t]; + + printf("######## %s/%s %s ########\n", + family_strings[f], type_strings[t], + inany ? " INANY " : "LOOPBACK"); + + setup_per_test(type, family, inany); + + test_err_inner_map(type, family); + + /* Install reuseport_array to the outer_map */ + err = bpf_map_update_elem(outer_map, &index_zero, + &reuseport_array, BPF_ANY); + CHECK(err == -1, "update_elem(outer_map)", + "err:%d errno:%d\n", err, errno); + + test_err_skb_data(type, family); + test_err_sk_select_port(type, family); + test_pass(type, family); + test_syncookie(type, family); + test_pass_on_err(type, family); + + cleanup_per_test(); + printf("\n"); + } + } +} + +int main(int argc, const char **argv) +{ + create_maps(); + prepare_bpf_obj(); + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); + saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); + enable_fastopen(); + disable_syncookie(); + atexit(restore_sysctls); + + test_all(); + + cleanup(); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_select_reuseport_common.h b/tools/testing/selftests/bpf/test_select_reuseport_common.h new file mode 100644 index 000000000000..08eb2a9f145f --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport_common.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef __TEST_SELECT_REUSEPORT_COMMON_H +#define __TEST_SELECT_REUSEPORT_COMMON_H + +#include <linux/types.h> + +enum result { + DROP_ERR_INNER_MAP, + DROP_ERR_SKB_DATA, + DROP_ERR_SK_SELECT_REUSEPORT, + DROP_MISC, + PASS, + PASS_ERR_SK_SELECT_REUSEPORT, + NR_RESULTS, +}; + +struct cmd { + __u32 reuseport_index; + __u32 pass_on_failure; +}; + +struct data_check { + __u32 ip_protocol; + __u32 skb_addrs[8]; + __u16 skb_ports[2]; + __u16 eth_protocol; + __u8 bind_inany; + __u8 equal_check_end[0]; + + __u32 len; + __u32 hash; +}; + +#endif diff --git a/tools/testing/selftests/bpf/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/test_select_reuseport_kern.c new file mode 100644 index 000000000000..5b54ec637ada --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport_kern.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ + +#include <stdlib.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/bpf.h> +#include <linux/types.h> +#include <linux/if_ether.h> + +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "test_select_reuseport_common.h" + +int _version SEC("version") = 1; + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +struct bpf_map_def SEC("maps") outer_map = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") result_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = NR_RESULTS, +}; + +struct bpf_map_def SEC("maps") tmp_index_ovr_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") linum_map = { + .type = 
BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") data_check_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct data_check), + .max_entries = 1, +}; + +#define GOTO_DONE(_result) ({ \ + result = (_result); \ + linum = __LINE__; \ + goto done; \ +}) + +SEC("select_by_skb_data") +int _select_by_skb_data(struct sk_reuseport_md *reuse_md) +{ + __u32 linum, index = 0, flags = 0, index_zero = 0; + __u32 *result_cnt, *linum_value; + struct data_check data_check = {}; + struct cmd *cmd, cmd_copy; + void *data, *data_end; + void *reuseport_array; + enum result result; + int *index_ovr; + int err; + + data = reuse_md->data; + data_end = reuse_md->data_end; + data_check.len = reuse_md->len; + data_check.eth_protocol = reuse_md->eth_protocol; + data_check.ip_protocol = reuse_md->ip_protocol; + data_check.hash = reuse_md->hash; + data_check.bind_inany = reuse_md->bind_inany; + if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) { + if (bpf_skb_load_bytes_relative(reuse_md, + offsetof(struct iphdr, saddr), + data_check.skb_addrs, 8, + BPF_HDR_START_NET)) + GOTO_DONE(DROP_MISC); + } else { + if (bpf_skb_load_bytes_relative(reuse_md, + offsetof(struct ipv6hdr, saddr), + data_check.skb_addrs, 32, + BPF_HDR_START_NET)) + GOTO_DONE(DROP_MISC); + } + + /* + * The ip_protocol could be a compile time decision + * if the bpf_prog.o is dedicated to either TCP or + * UDP. + * + * Otherwise, reuse_md->ip_protocol or + * the protocol field in the iphdr can be used. + */ + if (data_check.ip_protocol == IPPROTO_TCP) { + struct tcphdr *th = data; + + if (th + 1 > data_end) + GOTO_DONE(DROP_MISC); + + data_check.skb_ports[0] = th->source; + data_check.skb_ports[1] = th->dest; + + if ((th->doff << 2) + sizeof(*cmd) > data_check.len) + GOTO_DONE(DROP_ERR_SKB_DATA); + if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy, + sizeof(cmd_copy))) + GOTO_DONE(DROP_MISC); + cmd = &cmd_copy; + } else if (data_check.ip_protocol == IPPROTO_UDP) { + struct udphdr *uh = data; + + if (uh + 1 > data_end) + GOTO_DONE(DROP_MISC); + + data_check.skb_ports[0] = uh->source; + data_check.skb_ports[1] = uh->dest; + + if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len) + GOTO_DONE(DROP_ERR_SKB_DATA); + if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) { + if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr), + &cmd_copy, sizeof(cmd_copy))) + GOTO_DONE(DROP_MISC); + cmd = &cmd_copy; + } else { + cmd = data + sizeof(struct udphdr); + } + } else { + GOTO_DONE(DROP_MISC); + } + + reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero); + if (!reuseport_array) + GOTO_DONE(DROP_ERR_INNER_MAP); + + index = cmd->reuseport_index; + index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero); + if (!index_ovr) + GOTO_DONE(DROP_MISC); + + if (*index_ovr != -1) { + index = *index_ovr; + *index_ovr = -1; + } + err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index, + flags); + if (!err) + GOTO_DONE(PASS); + + if (cmd->pass_on_failure) + GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT); + else + GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT); + +done: + result_cnt = bpf_map_lookup_elem(&result_map, &result); + if (!result_cnt) + return SK_DROP; + + bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY); + bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY); + + (*result_cnt)++; + return result < PASS ? 
SK_DROP : SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh new file mode 100755 index 000000000000..42544a969abc --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -0,0 +1,62 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Facebook + +set -eu + +wait_for_ip() +{ + local _i + echo -n "Wait for testing link-local IP to become available " + for _i in $(seq ${MAX_PING_TRIES}); do + echo -n "." + if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then + echo " OK" + return + fi + sleep 1 + done + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." + exit 1 +} + +setup() +{ + # Create testing interfaces not to interfere with current environment. + ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} + ip link set ${TEST_IF} up + ip link set ${TEST_IF_PEER} up + + wait_for_ip + + tc qdisc add dev ${TEST_IF} clsact + tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ + sec ${BPF_PROG_SECTION} da + + BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ + awk '/ id / {sub(/.* id /, "", $0); print($1)}') +} + +cleanup() +{ + ip link del ${TEST_IF} 2>/dev/null || : + ip link del ${TEST_IF_PEER} 2>/dev/null || : +} + +main() +{ + trap cleanup EXIT 2 3 6 15 + setup + ${PROG} ${TEST_IF} ${BPF_PROG_ID} +} + +DIR=$(dirname $0) +TEST_IF="test_cgid_1" +TEST_IF_PEER="test_cgid_2" +MAX_PING_TRIES=5 +BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" +BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_ID=0 +PROG="${DIR}/test_skb_cgroup_id_user" + +main diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c new file mode 100644 index 000000000000..68cf9829f5a7 --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <linux/bpf.h> +#include <linux/pkt_cls.h> + +#include <string.h> + +#include "bpf_helpers.h" + +#define NUM_CGROUP_LEVELS 4 + +struct bpf_map_def SEC("maps") cgroup_ids = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64), + .max_entries = NUM_CGROUP_LEVELS, +}; + +static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) +{ + __u64 id; + + /* [1] &level passed to external function that may change it, it's + * incompatible with loop unroll. + */ + id = bpf_skb_ancestor_cgroup_id(skb, level); + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); +} + +SEC("cgroup_id_logger") +int log_cgroup_id(struct __sk_buff *skb) +{ + /* Loop unroll can't be used here due to [1]. Unrolling manually. + * Number of calls should be in sync with NUM_CGROUP_LEVELS. 
+ */ + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + + return TC_ACT_OK; +} + +int _version SEC("version") = 1; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c new file mode 100644 index 000000000000..c121cc59f314 --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <sys/types.h> + + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" + +#define CGROUP_PATH "/skb_cgroup_test" +#define NUM_CGROUP_LEVELS 4 + +/* RFC 4291, Section 2.7.1 */ +#define LINKLOCAL_MULTICAST "ff02::1" + +static int mk_dst_addr(const char *ip, const char *iface, + struct sockaddr_in6 *dst) +{ + memset(dst, 0, sizeof(*dst)); + + dst->sin6_family = AF_INET6; + dst->sin6_port = htons(1025); + + if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { + log_err("Invalid IPv6: %s", ip); + return -1; + } + + dst->sin6_scope_id = if_nametoindex(iface); + if (!dst->sin6_scope_id) { + log_err("Failed to get index of iface: %s", iface); + return -1; + } + + return 0; +} + +static int send_packet(const char *iface) +{ + struct sockaddr_in6 dst; + char msg[] = "msg"; + int err = 0; + int fd = -1; + + if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) + goto err; + + fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (fd == -1) { + log_err("Failed to create UDP socket"); + goto err; + } + + if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, + sizeof(dst)) == -1) { + log_err("Failed to send datagram"); + goto err; + } + + goto out; +err: + err = -1; +out: + if (fd >= 0) + close(fd); + return err; +} + +int get_map_fd_by_prog_id(int prog_id) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 map_ids[1]; + int prog_fd = -1; + int map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + log_err("Failed to get fd by prog id %d", prog_id); + goto err; + } + + info.nr_map_ids = 1; + info.map_ids = (__u64) (unsigned long) map_ids; + + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { + log_err("Failed to get info by prog fd %d", prog_fd); + goto err; + } + + if (!info.nr_map_ids) { + log_err("No maps found for prog fd %d", prog_fd); + goto err; + } + + map_fd = bpf_map_get_fd_by_id(map_ids[0]); + if (map_fd < 0) + log_err("Failed to get fd by map id %d", map_ids[0]); +err: + if (prog_fd >= 0) + close(prog_fd); + return map_fd; +} + +int check_ancestor_cgroup_ids(int prog_id) +{ + __u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; + __u32 level; + int err = 0; + int map_fd; + + expected_ids[0] = 0x100000001; /* root cgroup */ + expected_ids[1] = get_cgroup_id(""); + expected_ids[2] = get_cgroup_id(CGROUP_PATH); + expected_ids[3] = 0; /* non-existent cgroup */ + + map_fd = get_map_fd_by_prog_id(prog_id); + if (map_fd < 0) + goto err; + + for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { + if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { + log_err("Failed to lookup key %d", level); + goto err; + } + if (actual_ids[level] != expected_ids[level]) { + log_err("%llx (actual) != %llx (expected), level: %u\n", + actual_ids[level], expected_ids[level], 
level); + goto err; + } + } + + goto out; +err: + err = -1; +out: + if (map_fd >= 0) + close(map_fd); + return err; +} + +int main(int argc, char **argv) +{ + int cgfd = -1; + int err = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); + exit(EXIT_FAILURE); + } + + if (setup_cgroup_environment()) + goto err; + + cgfd = create_and_get_cgroup(CGROUP_PATH); + if (!cgfd) + goto err; + + if (join_cgroup(CGROUP_PATH)) + goto err; + + if (send_packet(argv[1])) + goto err; + + if (check_ancestor_cgroup_ids(atoi(argv[2]))) + goto err; + + goto out; +err: + err = -1; +out: + close(cgfd); + cleanup_cgroup_environment(); + printf("[%s]\n", err ? "FAIL" : "PASS"); + return err; +} diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index f4d99fabc56d..b8ebe2f58074 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -14,10 +14,7 @@ #include "cgroup_helpers.h" #include "bpf_rlimit.h" - -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif +#include "bpf_util.h" #define CG_PATH "/foo" #define MAX_INSNS 512 diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 2e45c92d1111..aeeb76a54d63 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -20,15 +20,12 @@ #include "cgroup_helpers.h" #include "bpf_rlimit.h" +#include "bpf_util.h" #ifndef ENOTSUPP # define ENOTSUPP 524 #endif -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - #define CG_PATH "/foo" #define CONNECT4_PROG_PATH "./connect4_prog.o" #define CONNECT6_PROG_PATH "./connect6_prog.o" diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 452cf5c6c784..67c412d19c09 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -42,12 +42,9 @@ #endif #include "bpf_rlimit.h" #include "bpf_rand.h" +#include "bpf_util.h" #include "../../../include/linux/filter.h" -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - #define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 #define MAX_NR_MAPS 8 |
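Taken together, the new selftests center on the SK_REUSEPORT program type and its single new helper, bpf_sk_select_reuseport(). Stripped of the result/linum bookkeeping that test_select_reuseport_kern.c needs for verification, a selection program can be as small as the sketch below; the map name, section name, and array size are made up for illustration, it assumes the bpf_sk_select_reuseport() wrapper this series adds to bpf_helpers.h, and it is not the program the selftests load.

/* Sketch: select one socket out of a reuseport group by a fixed index.
 * Simplified for illustration; not the selftest program.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

int _version SEC("version") = 1;

struct bpf_map_def SEC("maps") sock_array = {
	.type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u32),
	.max_entries = 4,
};

SEC("sk_reuseport_demo")
int select_first_sock(struct sk_reuseport_md *reuse_md)
{
	__u32 index = 0;

	/* On success the kernel delivers the skb to sock_array[index]. */
	if (bpf_sk_select_reuseport(reuse_md, &sock_array, &index, 0))
		return SK_DROP;

	return SK_PASS;
}

char _license[] SEC("license") = "GPL";

User space would then populate sock_array with bpf_map_update_elem() on SO_REUSEPORT sockets and attach the program fd to one member of the group with setsockopt(SO_ATTACH_REUSEPORT_EBPF), exactly as prepare_sk_fds() does above.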