From 22b436c9b5682e877d34425d05576db74a8647e1 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:01 +0200 Subject: virtio-net: Introduce extended RSC feature VIRTIO_NET_F_RSC_EXT feature bit indicates that the device is able to provide extended RSC information. When the feature is negotiatede and 'gso_type' field in received packet is not GSO_NONE, the device reports number of coalesced packets in 'csum_start' field and number of duplicated acks in 'csum_offset' field and sets VIRTIO_NET_HDR_F_RSC_INFO in 'flags' field. Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-2-yuri.benditovich@daynix.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index a3715a3224c1..6466c5979a93 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device * with the same MAC. */ @@ -104,6 +105,7 @@ struct virtio_net_config { struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ +#define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */ __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ @@ -113,8 +115,26 @@ struct virtio_net_hdr_v1 { __u8 gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ - __virtio16 csum_start; /* Position to start checksumming from */ - __virtio16 csum_offset; /* Offset after that to place checksum */ + union { + struct { + __virtio16 csum_start; + __virtio16 csum_offset; + }; + /* Checksum calculation */ + struct { + /* Position to start checksumming from */ + __virtio16 start; + /* Offset after that to place checksum */ + __virtio16 offset; + } csum; + /* Receive Segment Coalescing */ + struct { + /* Number of coalesced segments */ + __le16 segments; + /* Number of duplicated acks */ + __le16 dup_acks; + } rsc; + }; __virtio16 num_buffers; /* Number of merged rx buffers */ }; -- cgit v1.2.3 From fd58bf674564f1731ca8a61a5150d40383f3df60 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:02 +0200 Subject: virtio-net: Introduce RSS receive steering feature RSS (Receive-side scaling) defines hash calculation rules and decision on receive virtqueue according to the calculated hash, provided mask to apply and provided indirection table containing indices of receive virqueues. The driver sends the control command to enable multiqueue and provide parameters for receive steering. Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-3-yuri.benditovich@daynix.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 42 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 6466c5979a93..aec6fac3666a 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device * with the same MAC. @@ -70,6 +71,17 @@ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ #define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ +/* supported/enabled hash types */ +#define VIRTIO_NET_RSS_HASH_TYPE_IPv4 (1 << 0) +#define VIRTIO_NET_RSS_HASH_TYPE_TCPv4 (1 << 1) +#define VIRTIO_NET_RSS_HASH_TYPE_UDPv4 (1 << 2) +#define VIRTIO_NET_RSS_HASH_TYPE_IPv6 (1 << 3) +#define VIRTIO_NET_RSS_HASH_TYPE_TCPv6 (1 << 4) +#define VIRTIO_NET_RSS_HASH_TYPE_UDPv6 (1 << 5) +#define VIRTIO_NET_RSS_HASH_TYPE_IP_EX (1 << 6) +#define VIRTIO_NET_RSS_HASH_TYPE_TCP_EX (1 << 7) +#define VIRTIO_NET_RSS_HASH_TYPE_UDP_EX (1 << 8) + struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[ETH_ALEN]; @@ -93,6 +105,12 @@ struct virtio_net_config { * Any other value stands for unknown. */ __u8 duplex; + /* maximum size of RSS key */ + __u8 rss_max_key_size; + /* maximum number of indirection table entries */ + __le16 rss_max_indirection_table_length; + /* bitmask of supported VIRTIO_NET_RSS_HASH_ types */ + __le32 supported_hash_types; } __attribute__((packed)); /* @@ -248,7 +266,9 @@ struct virtio_net_ctrl_mac { /* * Control Receive Flow Steering - * + */ +#define VIRTIO_NET_CTRL_MQ 4 +/* * The command VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET * enables Receive Flow Steering, specifying the number of the transmit and * receive queues that will be used. After the command is consumed and acked by @@ -261,11 +281,29 @@ struct virtio_net_ctrl_mq { __virtio16 virtqueue_pairs; }; -#define VIRTIO_NET_CTRL_MQ 4 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET 0 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN 1 #define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX 0x8000 +/* + * The command VIRTIO_NET_CTRL_MQ_RSS_CONFIG has the same effect as + * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET does and additionally configures + * the receive steering to use a hash calculated for incoming packet + * to decide on receive virtqueue to place the packet. The command + * also provides parameters to calculate a hash and receive virtqueue. + */ +struct virtio_net_rss_config { + __le32 hash_types; + __le16 indirection_table_mask; + __le16 unclassified_queue; + __le16 indirection_table[1/* + indirection_table_mask */]; + __le16 max_tx_vq; + __u8 hash_key_length; + __u8 hash_key_data[/* hash_key_length */]; +}; + + #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 + /* * Control network offloads * -- cgit v1.2.3 From 3024e20958ee9e1554951df4d26aaf9f5cb7c210 Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Mon, 2 Mar 2020 13:50:03 +0200 Subject: virtio-net: Introduce hash report feature The feature VIRTIO_NET_F_HASH_REPORT extends the layout of the packet and requests the device to calculate hash on incoming packets and report it in the packet header. Signed-off-by: Yuri Benditovich Link: https://lore.kernel.org/r/20200302115003.14877-4-yuri.benditovich@daynix.com Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_net.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index aec6fac3666a..19d23e5baa4e 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -57,6 +57,7 @@ * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ +#define VIRTIO_NET_F_HASH_REPORT 57 /* Supports hash report */ #define VIRTIO_NET_F_RSS 60 /* Supports RSS RX steering */ #define VIRTIO_NET_F_RSC_EXT 61 /* extended coalescing info */ #define VIRTIO_NET_F_STANDBY 62 /* Act as standby for another device @@ -156,6 +157,23 @@ struct virtio_net_hdr_v1 { __virtio16 num_buffers; /* Number of merged rx buffers */ }; +struct virtio_net_hdr_v1_hash { + struct virtio_net_hdr_v1 hdr; + __le32 hash_value; +#define VIRTIO_NET_HASH_REPORT_NONE 0 +#define VIRTIO_NET_HASH_REPORT_IPv4 1 +#define VIRTIO_NET_HASH_REPORT_TCPv4 2 +#define VIRTIO_NET_HASH_REPORT_UDPv4 3 +#define VIRTIO_NET_HASH_REPORT_IPv6 4 +#define VIRTIO_NET_HASH_REPORT_TCPv6 5 +#define VIRTIO_NET_HASH_REPORT_UDPv6 6 +#define VIRTIO_NET_HASH_REPORT_IPv6_EX 7 +#define VIRTIO_NET_HASH_REPORT_TCPv6_EX 8 +#define VIRTIO_NET_HASH_REPORT_UDPv6_EX 9 + __le16 hash_report; + __le16 padding; +}; + #ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must @@ -304,6 +322,24 @@ struct virtio_net_rss_config { #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 +/* + * The command VIRTIO_NET_CTRL_MQ_HASH_CONFIG requests the device + * to include in the virtio header of the packet the value of the + * calculated hash and the report type of hash. It also provides + * parameters for hash calculation. The command requires feature + * VIRTIO_NET_F_HASH_REPORT to be negotiated to extend the + * layout of virtio header as defined in virtio_net_hdr_v1_hash. + */ +struct virtio_net_hash_config { + __le32 hash_types; + /* for compatibility with virtio_net_rss_config */ + __le16 reserved[4]; + __u8 hash_key_length; + __u8 hash_key_data[/* hash_key_length */]; +}; + + #define VIRTIO_NET_CTRL_MQ_HASH_CONFIG 2 + /* * Control network offloads * -- cgit v1.2.3 From 0bbe30668d89ec8a309f28ced6d092c90fb23e8c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 26 Mar 2020 22:01:19 +0800 Subject: vhost: factor out IOTLB This patch factors out IOTLB into a dedicated module in order to be reused by other modules like vringh. User may choose to enable the automatic retiring by specifying VHOST_IOTLB_FLAG_RETIRE flag to fit for the case of vhost device IOTLB implementation. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200326140125.19794-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + drivers/vhost/Kconfig | 6 ++ drivers/vhost/Makefile | 3 + drivers/vhost/iotlb.c | 177 +++++++++++++++++++++++++++++++++++ drivers/vhost/net.c | 2 +- drivers/vhost/vhost.c | 221 ++++++++++++++------------------------------ drivers/vhost/vhost.h | 39 +++----- include/linux/vhost_iotlb.h | 47 ++++++++++ 8 files changed, 315 insertions(+), 181 deletions(-) create mode 100644 drivers/vhost/iotlb.c create mode 100644 include/linux/vhost_iotlb.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index cc1d18cb5d18..19363ed5e723 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17766,6 +17766,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git S: Maintained F: drivers/vhost/ F: include/uapi/linux/vhost.h +F: include/linux/vhost_iotlb.h VIRTIO INPUT DRIVER M: Gerd Hoffmann diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index e775beddc36a..37400a1655b4 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,4 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only +config VHOST_IOTLB + tristate + help + Generic IOTLB implementation for vhost and vringh. + config VHOST_RING tristate help @@ -67,4 +72,5 @@ config VHOST_CROSS_ENDIAN_LEGACY adds some overhead, it is disabled by default. If unsure, say "N". + endif diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 6c6df24f770c..fb831002bcf0 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -11,3 +11,6 @@ vhost_vsock-y := vsock.o obj-$(CONFIG_VHOST_RING) += vringh.o obj-$(CONFIG_VHOST) += vhost.o + +obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o +vhost_iotlb-y := iotlb.o diff --git a/drivers/vhost/iotlb.c b/drivers/vhost/iotlb.c new file mode 100644 index 000000000000..1f0ca6e44410 --- /dev/null +++ b/drivers/vhost/iotlb.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2020 Red Hat, Inc. + * Author: Jason Wang + * + * IOTLB implementation for vhost. + */ +#include +#include +#include + +#define MOD_VERSION "0.1" +#define MOD_DESC "VHOST IOTLB" +#define MOD_AUTHOR "Jason Wang " +#define MOD_LICENSE "GPL v2" + +#define START(map) ((map)->start) +#define LAST(map) ((map)->last) + +INTERVAL_TREE_DEFINE(struct vhost_iotlb_map, + rb, __u64, __subtree_last, + START, LAST, static inline, vhost_iotlb_itree); + +/** + * vhost_iotlb_map_free - remove a map node and free it + * @iotlb: the IOTLB + * @map: the map that want to be remove and freed + */ +void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, + struct vhost_iotlb_map *map) +{ + vhost_iotlb_itree_remove(map, &iotlb->root); + list_del(&map->link); + kfree(map); + iotlb->nmaps--; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_map_free); + +/** + * vhost_iotlb_add_range - add a new range to vhost IOTLB + * @iotlb: the IOTLB + * @start: start of the IOVA range + * @last: last of IOVA range + * @addr: the address that is mapped to @start + * @perm: access permission of this range + * + * Returns an error last is smaller than start or memory allocation + * fails + */ +int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, + u64 start, u64 last, + u64 addr, unsigned int perm) +{ + struct vhost_iotlb_map *map; + + if (last < start) + return -EFAULT; + + if (iotlb->limit && + iotlb->nmaps == iotlb->limit && + iotlb->flags & VHOST_IOTLB_FLAG_RETIRE) { + map = list_first_entry(&iotlb->list, typeof(*map), link); + vhost_iotlb_map_free(iotlb, map); + } + + map = kmalloc(sizeof(*map), GFP_ATOMIC); + if (!map) + return -ENOMEM; + + map->start = start; + map->size = last - start + 1; + map->last = last; + map->addr = addr; + map->perm = perm; + + iotlb->nmaps++; + vhost_iotlb_itree_insert(map, &iotlb->root); + + INIT_LIST_HEAD(&map->link); + list_add_tail(&map->link, &iotlb->list); + + return 0; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_add_range); + +/** + * vring_iotlb_del_range - delete overlapped ranges from vhost IOTLB + * @iotlb: the IOTLB + * @start: start of the IOVA range + * @last: last of IOVA range + */ +void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last) +{ + struct vhost_iotlb_map *map; + + while ((map = vhost_iotlb_itree_iter_first(&iotlb->root, + start, last))) + vhost_iotlb_map_free(iotlb, map); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_del_range); + +/** + * vhost_iotlb_alloc - add a new vhost IOTLB + * @limit: maximum number of IOTLB entries + * @flags: VHOST_IOTLB_FLAG_XXX + * + * Returns an error is memory allocation fails + */ +struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags) +{ + struct vhost_iotlb *iotlb = kzalloc(sizeof(*iotlb), GFP_KERNEL); + + if (!iotlb) + return NULL; + + iotlb->root = RB_ROOT_CACHED; + iotlb->limit = limit; + iotlb->nmaps = 0; + iotlb->flags = flags; + INIT_LIST_HEAD(&iotlb->list); + + return iotlb; +} +EXPORT_SYMBOL_GPL(vhost_iotlb_alloc); + +/** + * vhost_iotlb_reset - reset vhost IOTLB (free all IOTLB entries) + * @iotlb: the IOTLB to be reset + */ +void vhost_iotlb_reset(struct vhost_iotlb *iotlb) +{ + vhost_iotlb_del_range(iotlb, 0ULL, 0ULL - 1); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_reset); + +/** + * vhost_iotlb_free - reset and free vhost IOTLB + * @iotlb: the IOTLB to be freed + */ +void vhost_iotlb_free(struct vhost_iotlb *iotlb) +{ + if (iotlb) { + vhost_iotlb_reset(iotlb); + kfree(iotlb); + } +} +EXPORT_SYMBOL_GPL(vhost_iotlb_free); + +/** + * vhost_iotlb_itree_first - return the first overlapped range + * @iotlb: the IOTLB + * @start: start of IOVA range + * @end: end of IOVA range + */ +struct vhost_iotlb_map * +vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last) +{ + return vhost_iotlb_itree_iter_first(&iotlb->root, start, last); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_itree_first); + +/** + * vhost_iotlb_itree_first - return the next overlapped range + * @iotlb: the IOTLB + * @start: start of IOVA range + * @end: end of IOVA range + */ +struct vhost_iotlb_map * +vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last) +{ + return vhost_iotlb_itree_iter_next(map, start, last); +} +EXPORT_SYMBOL_GPL(vhost_iotlb_itree_next); + +MODULE_VERSION(MOD_VERSION); +MODULE_DESCRIPTION(MOD_DESC); +MODULE_AUTHOR(MOD_AUTHOR); +MODULE_LICENSE(MOD_LICENSE); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 7b1d2dfec7f2..87469d67ede8 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1587,7 +1587,7 @@ static long vhost_net_reset_owner(struct vhost_net *n) struct socket *tx_sock = NULL; struct socket *rx_sock = NULL; long err; - struct vhost_umem *umem; + struct vhost_iotlb *umem; mutex_lock(&n->dev.mutex); err = vhost_dev_check_owner(&n->dev); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8e9e2341e40a..d450e16c5c25 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -50,10 +50,6 @@ enum { #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num]) -INTERVAL_TREE_DEFINE(struct vhost_umem_node, - rb, __u64, __subtree_last, - START, LAST, static inline, vhost_umem_interval_tree); - #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY static void vhost_disable_cross_endian(struct vhost_virtqueue *vq) { @@ -584,21 +580,25 @@ err_mm: } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); -struct vhost_umem *vhost_dev_reset_owner_prepare(void) +static struct vhost_iotlb *iotlb_alloc(void) +{ + return vhost_iotlb_alloc(max_iotlb_entries, + VHOST_IOTLB_FLAG_RETIRE); +} + +struct vhost_iotlb *vhost_dev_reset_owner_prepare(void) { - return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL); + return iotlb_alloc(); } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); /* Caller should have device mutex */ -void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_umem *umem) +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) { int i; vhost_dev_cleanup(dev); - /* Restore memory to default empty mapping. */ - INIT_LIST_HEAD(&umem->umem_list); dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. @@ -621,28 +621,6 @@ void vhost_dev_stop(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_stop); -static void vhost_umem_free(struct vhost_umem *umem, - struct vhost_umem_node *node) -{ - vhost_umem_interval_tree_remove(node, &umem->umem_tree); - list_del(&node->link); - kfree(node); - umem->numem--; -} - -static void vhost_umem_clean(struct vhost_umem *umem) -{ - struct vhost_umem_node *node, *tmp; - - if (!umem) - return; - - list_for_each_entry_safe(node, tmp, &umem->umem_list, link) - vhost_umem_free(umem, node); - - kvfree(umem); -} - static void vhost_clear_msg(struct vhost_dev *dev) { struct vhost_msg_node *node, *n; @@ -680,9 +658,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) eventfd_ctx_put(dev->log_ctx); dev->log_ctx = NULL; /* No one will access memory at this point */ - vhost_umem_clean(dev->umem); + vhost_iotlb_free(dev->umem); dev->umem = NULL; - vhost_umem_clean(dev->iotlb); + vhost_iotlb_free(dev->iotlb); dev->iotlb = NULL; vhost_clear_msg(dev); wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM); @@ -718,27 +696,26 @@ static bool vhost_overflow(u64 uaddr, u64 size) } /* Caller should have vq mutex and device mutex. */ -static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, +static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem, int log_all) { - struct vhost_umem_node *node; + struct vhost_iotlb_map *map; if (!umem) return false; - list_for_each_entry(node, &umem->umem_list, link) { - unsigned long a = node->userspace_addr; + list_for_each_entry(map, &umem->list, link) { + unsigned long a = map->addr; - if (vhost_overflow(node->userspace_addr, node->size)) + if (vhost_overflow(map->addr, map->size)) return false; - if (!access_ok((void __user *)a, - node->size)) + if (!access_ok((void __user *)a, map->size)) return false; else if (log_all && !log_access_ok(log_base, - node->start, - node->size)) + map->start, + map->size)) return false; } return true; @@ -748,17 +725,17 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, u64 addr, unsigned int size, int type) { - const struct vhost_umem_node *node = vq->meta_iotlb[type]; + const struct vhost_iotlb_map *map = vq->meta_iotlb[type]; - if (!node) + if (!map) return NULL; - return (void *)(uintptr_t)(node->userspace_addr + addr - node->start); + return (void *)(uintptr_t)(map->addr + addr - map->start); } /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ -static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, +static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem, int log_all) { int i; @@ -1023,47 +1000,6 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq, return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc)); } -static int vhost_new_umem_range(struct vhost_umem *umem, - u64 start, u64 size, u64 end, - u64 userspace_addr, int perm) -{ - struct vhost_umem_node *tmp, *node; - - if (!size) - return -EFAULT; - - node = kmalloc(sizeof(*node), GFP_ATOMIC); - if (!node) - return -ENOMEM; - - if (umem->numem == max_iotlb_entries) { - tmp = list_first_entry(&umem->umem_list, typeof(*tmp), link); - vhost_umem_free(umem, tmp); - } - - node->start = start; - node->size = size; - node->last = end; - node->userspace_addr = userspace_addr; - node->perm = perm; - INIT_LIST_HEAD(&node->link); - list_add_tail(&node->link, &umem->umem_list); - vhost_umem_interval_tree_insert(node, &umem->umem_tree); - umem->numem++; - - return 0; -} - -static void vhost_del_umem_range(struct vhost_umem *umem, - u64 start, u64 end) -{ - struct vhost_umem_node *node; - - while ((node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - start, end))) - vhost_umem_free(umem, node); -} - static void vhost_iotlb_notify_vq(struct vhost_dev *d, struct vhost_iotlb_msg *msg) { @@ -1120,9 +1056,9 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, break; } vhost_vq_meta_reset(dev); - if (vhost_new_umem_range(dev->iotlb, msg->iova, msg->size, - msg->iova + msg->size - 1, - msg->uaddr, msg->perm)) { + if (vhost_iotlb_add_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1, + msg->uaddr, msg->perm)) { ret = -ENOMEM; break; } @@ -1134,8 +1070,8 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev, break; } vhost_vq_meta_reset(dev); - vhost_del_umem_range(dev->iotlb, msg->iova, - msg->iova + msg->size - 1); + vhost_iotlb_del_range(dev->iotlb, msg->iova, + msg->iova + msg->size - 1); break; default: ret = -EINVAL; @@ -1319,44 +1255,42 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, } static void vhost_vq_meta_update(struct vhost_virtqueue *vq, - const struct vhost_umem_node *node, + const struct vhost_iotlb_map *map, int type) { int access = (type == VHOST_ADDR_USED) ? VHOST_ACCESS_WO : VHOST_ACCESS_RO; - if (likely(node->perm & access)) - vq->meta_iotlb[type] = node; + if (likely(map->perm & access)) + vq->meta_iotlb[type] = map; } static bool iotlb_access_ok(struct vhost_virtqueue *vq, int access, u64 addr, u64 len, int type) { - const struct vhost_umem_node *node; - struct vhost_umem *umem = vq->iotlb; + const struct vhost_iotlb_map *map; + struct vhost_iotlb *umem = vq->iotlb; u64 s = 0, size, orig_addr = addr, last = addr + len - 1; if (vhost_vq_meta_fetch(vq, addr, len, type)) return true; while (len > s) { - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, - last); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, last); + if (map == NULL || map->start > addr) { vhost_iotlb_miss(vq, addr, access); return false; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { /* Report the possible access violation by * request another translation from userspace. */ return false; } - size = node->size - addr + node->start; + size = map->size - addr + map->start; if (orig_addr == addr && size >= len) - vhost_vq_meta_update(vq, node, type); + vhost_vq_meta_update(vq, map, type); s += size; addr += size; @@ -1372,12 +1306,12 @@ int vq_meta_prefetch(struct vhost_virtqueue *vq) if (!vq->iotlb) return 1; - return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc, vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) && - iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, + iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail, vhost_get_avail_size(vq, num), VHOST_ADDR_AVAIL) && - iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used, + iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used, vhost_get_used_size(vq, num), VHOST_ADDR_USED); } EXPORT_SYMBOL_GPL(vq_meta_prefetch); @@ -1416,25 +1350,11 @@ bool vhost_vq_access_ok(struct vhost_virtqueue *vq) } EXPORT_SYMBOL_GPL(vhost_vq_access_ok); -static struct vhost_umem *vhost_umem_alloc(void) -{ - struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL); - - if (!umem) - return NULL; - - umem->umem_tree = RB_ROOT_CACHED; - umem->numem = 0; - INIT_LIST_HEAD(&umem->umem_list); - - return umem; -} - static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) { struct vhost_memory mem, *newmem; struct vhost_memory_region *region; - struct vhost_umem *newumem, *oldumem; + struct vhost_iotlb *newumem, *oldumem; unsigned long size = offsetof(struct vhost_memory, regions); int i; @@ -1456,7 +1376,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EFAULT; } - newumem = vhost_umem_alloc(); + newumem = iotlb_alloc(); if (!newumem) { kvfree(newmem); return -ENOMEM; @@ -1465,13 +1385,12 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) for (region = newmem->regions; region < newmem->regions + mem.nregions; region++) { - if (vhost_new_umem_range(newumem, - region->guest_phys_addr, - region->memory_size, - region->guest_phys_addr + - region->memory_size - 1, - region->userspace_addr, - VHOST_ACCESS_RW)) + if (vhost_iotlb_add_range(newumem, + region->guest_phys_addr, + region->guest_phys_addr + + region->memory_size - 1, + region->userspace_addr, + VHOST_MAP_RW)) goto err; } @@ -1489,11 +1408,11 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) } kvfree(newmem); - vhost_umem_clean(oldumem); + vhost_iotlb_free(oldumem); return 0; err: - vhost_umem_clean(newumem); + vhost_iotlb_free(newumem); kvfree(newmem); return -EFAULT; } @@ -1734,10 +1653,10 @@ EXPORT_SYMBOL_GPL(vhost_vring_ioctl); int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) { - struct vhost_umem *niotlb, *oiotlb; + struct vhost_iotlb *niotlb, *oiotlb; int i; - niotlb = vhost_umem_alloc(); + niotlb = iotlb_alloc(); if (!niotlb) return -ENOMEM; @@ -1753,7 +1672,7 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled) mutex_unlock(&vq->mutex); } - vhost_umem_clean(oiotlb); + vhost_iotlb_free(oiotlb); return 0; } @@ -1883,8 +1802,8 @@ static int log_write(void __user *log_base, static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) { - struct vhost_umem *umem = vq->umem; - struct vhost_umem_node *u; + struct vhost_iotlb *umem = vq->umem; + struct vhost_iotlb_map *u; u64 start, end, l, min; int r; bool hit = false; @@ -1894,16 +1813,15 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) /* More than one GPAs can be mapped into a single HVA. So * iterate all possible umems here to be safe. */ - list_for_each_entry(u, &umem->umem_list, link) { - if (u->userspace_addr > hva - 1 + len || - u->userspace_addr - 1 + u->size < hva) + list_for_each_entry(u, &umem->list, link) { + if (u->addr > hva - 1 + len || + u->addr - 1 + u->size < hva) continue; - start = max(u->userspace_addr, hva); - end = min(u->userspace_addr - 1 + u->size, - hva - 1 + len); + start = max(u->addr, hva); + end = min(u->addr - 1 + u->size, hva - 1 + len); l = end - start + 1; r = log_write(vq->log_base, - u->start + start - u->userspace_addr, + u->start + start - u->addr, l); if (r < 0) return r; @@ -2054,9 +1972,9 @@ EXPORT_SYMBOL_GPL(vhost_vq_init_access); static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, struct iovec iov[], int iov_size, int access) { - const struct vhost_umem_node *node; + const struct vhost_iotlb_map *map; struct vhost_dev *dev = vq->dev; - struct vhost_umem *umem = dev->iotlb ? dev->iotlb : dev->umem; + struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem; struct iovec *_iov; u64 s = 0; int ret = 0; @@ -2068,25 +1986,24 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, break; } - node = vhost_umem_interval_tree_iter_first(&umem->umem_tree, - addr, addr + len - 1); - if (node == NULL || node->start > addr) { + map = vhost_iotlb_itree_first(umem, addr, addr + len - 1); + if (map == NULL || map->start > addr) { if (umem != dev->iotlb) { ret = -EFAULT; break; } ret = -EAGAIN; break; - } else if (!(node->perm & access)) { + } else if (!(map->perm & access)) { ret = -EPERM; break; } _iov = iov + ret; - size = node->size - addr + node->start; + size = map->size - addr + map->start; _iov->iov_len = min((u64)len - s, size); _iov->iov_base = (void __user *)(unsigned long) - (node->userspace_addr + addr - node->start); + (map->addr + addr - map->start); s += size; addr += size; ++ret; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f9d1a03dd153..181382185bbc 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -12,6 +12,7 @@ #include #include #include +#include struct vhost_work; typedef void (*vhost_work_fn_t)(struct vhost_work *work); @@ -52,27 +53,6 @@ struct vhost_log { u64 len; }; -#define START(node) ((node)->start) -#define LAST(node) ((node)->last) - -struct vhost_umem_node { - struct rb_node rb; - struct list_head link; - __u64 start; - __u64 last; - __u64 size; - __u64 userspace_addr; - __u32 perm; - __u32 flags_padding; - __u64 __subtree_last; -}; - -struct vhost_umem { - struct rb_root_cached umem_tree; - struct list_head umem_list; - int numem; -}; - enum vhost_uaddr_type { VHOST_ADDR_DESC = 0, VHOST_ADDR_AVAIL = 1, @@ -90,7 +70,7 @@ struct vhost_virtqueue { struct vring_desc __user *desc; struct vring_avail __user *avail; struct vring_used __user *used; - const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS]; + const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS]; struct file *kick; struct eventfd_ctx *call_ctx; struct eventfd_ctx *error_ctx; @@ -128,8 +108,8 @@ struct vhost_virtqueue { struct iovec *indirect; struct vring_used_elem *heads; /* Protected by virtqueue mutex. */ - struct vhost_umem *umem; - struct vhost_umem *iotlb; + struct vhost_iotlb *umem; + struct vhost_iotlb *iotlb; void *private_data; u64 acked_features; u64 acked_backend_features; @@ -164,8 +144,8 @@ struct vhost_dev { struct eventfd_ctx *log_ctx; struct llist_head work_list; struct task_struct *worker; - struct vhost_umem *umem; - struct vhost_umem *iotlb; + struct vhost_iotlb *umem; + struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; struct list_head read_list; struct list_head pending_list; @@ -186,8 +166,8 @@ void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, long vhost_dev_set_owner(struct vhost_dev *dev); bool vhost_dev_has_owner(struct vhost_dev *dev); long vhost_dev_check_owner(struct vhost_dev *); -struct vhost_umem *vhost_dev_reset_owner_prepare(void); -void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *); +struct vhost_iotlb *vhost_dev_reset_owner_prepare(void); +void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb); void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); @@ -233,6 +213,9 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev, struct iov_iter *from); int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled); +void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, + struct vhost_iotlb_map *map); + #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \ diff --git a/include/linux/vhost_iotlb.h b/include/linux/vhost_iotlb.h new file mode 100644 index 000000000000..6b09b786a762 --- /dev/null +++ b/include/linux/vhost_iotlb.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VHOST_IOTLB_H +#define _LINUX_VHOST_IOTLB_H + +#include + +struct vhost_iotlb_map { + struct rb_node rb; + struct list_head link; + u64 start; + u64 last; + u64 size; + u64 addr; +#define VHOST_MAP_RO 0x1 +#define VHOST_MAP_WO 0x2 +#define VHOST_MAP_RW 0x3 + u32 perm; + u32 flags_padding; + u64 __subtree_last; +}; + +#define VHOST_IOTLB_FLAG_RETIRE 0x1 + +struct vhost_iotlb { + struct rb_root_cached root; + struct list_head list; + unsigned int limit; + unsigned int nmaps; + unsigned int flags; +}; + +int vhost_iotlb_add_range(struct vhost_iotlb *iotlb, u64 start, u64 last, + u64 addr, unsigned int perm); +void vhost_iotlb_del_range(struct vhost_iotlb *iotlb, u64 start, u64 last); + +struct vhost_iotlb *vhost_iotlb_alloc(unsigned int limit, unsigned int flags); +void vhost_iotlb_free(struct vhost_iotlb *iotlb); +void vhost_iotlb_reset(struct vhost_iotlb *iotlb); + +struct vhost_iotlb_map * +vhost_iotlb_itree_first(struct vhost_iotlb *iotlb, u64 start, u64 last); +struct vhost_iotlb_map * +vhost_iotlb_itree_next(struct vhost_iotlb_map *map, u64 start, u64 last); + +void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, + struct vhost_iotlb_map *map); +#endif -- cgit v1.2.3 From 9ad9c49cfe970b053bb0ef323b682dd1b4d4f8a0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 26 Mar 2020 22:01:20 +0800 Subject: vringh: IOTLB support This patch implements the third memory accessor for vringh besides current kernel and userspace accessors. This idea is to allow vringh to do the address translation through an IOTLB which is implemented via vhost_map interval tree. Users should setup and IOVA to PA mapping in this IOTLB. This allows us to: - Use vringh to access virtqueues with vIOMMU - Use vringh to implement software virtqueues for vDPA devices Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200326140125.19794-5-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/Kconfig | 1 + drivers/vhost/vringh.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/vringh.h | 36 +++++ 3 files changed, 435 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 37400a1655b4..128238488078 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -6,6 +6,7 @@ config VHOST_IOTLB config VHOST_RING tristate + select VHOST_IOTLB help This option is selected by any driver which needs to access the host side of a virtio ring. diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index a0a2d74967ef..ee0491f579ac 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include #include static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) @@ -71,9 +74,11 @@ static inline int __vringh_get_head(const struct vringh *vrh, } /* Copy some bytes to/from the iovec. Returns num copied. */ -static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, +static inline ssize_t vringh_iov_xfer(struct vringh *vrh, + struct vringh_kiov *iov, void *ptr, size_t len, - int (*xfer)(void *addr, void *ptr, + int (*xfer)(const struct vringh *vrh, + void *addr, void *ptr, size_t len)) { int err, done = 0; @@ -82,7 +87,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, size_t partlen; partlen = min(iov->iov[iov->i].iov_len, len); - err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen); if (err) return err; done += partlen; @@ -96,6 +101,7 @@ static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, /* Fix up old iov element then increment. */ iov->iov[iov->i].iov_len = iov->consumed; iov->iov[iov->i].iov_base -= iov->consumed; + iov->consumed = 0; iov->i++; @@ -227,7 +233,8 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src, u64 addr, struct vringh_range *r), struct vringh_range *range, - int (*copy)(void *dst, const void *src, size_t len)) + int (*copy)(const struct vringh *vrh, + void *dst, const void *src, size_t len)) { size_t part, len = sizeof(struct vring_desc); @@ -241,7 +248,7 @@ static int slow_copy(struct vringh *vrh, void *dst, const void *src, if (!rcheck(vrh, addr, &part, range, getrange)) return -EINVAL; - err = copy(dst, src, part); + err = copy(vrh, dst, src, part); if (err) return err; @@ -262,7 +269,8 @@ __vringh_iov(struct vringh *vrh, u16 i, struct vringh_range *)), bool (*getrange)(struct vringh *, u64, struct vringh_range *), gfp_t gfp, - int (*copy)(void *dst, const void *src, size_t len)) + int (*copy)(const struct vringh *vrh, + void *dst, const void *src, size_t len)) { int err, count = 0, up_next, desc_max; struct vring_desc desc, *descs; @@ -291,7 +299,7 @@ __vringh_iov(struct vringh *vrh, u16 i, err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, &slowrange, copy); else - err = copy(&desc, &descs[i], sizeof(desc)); + err = copy(vrh, &desc, &descs[i], sizeof(desc)); if (unlikely(err)) goto fail; @@ -404,7 +412,8 @@ static inline int __vringh_complete(struct vringh *vrh, unsigned int num_used, int (*putu16)(const struct vringh *vrh, __virtio16 *p, u16 val), - int (*putused)(struct vring_used_elem *dst, + int (*putused)(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned num)) { @@ -420,12 +429,12 @@ static inline int __vringh_complete(struct vringh *vrh, /* Compiler knows num_used == 1 sometimes, hence extra check */ if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { u16 part = vrh->vring.num - off; - err = putused(&used_ring->ring[off], used, part); + err = putused(vrh, &used_ring->ring[off], used, part); if (!err) - err = putused(&used_ring->ring[0], used + part, + err = putused(vrh, &used_ring->ring[0], used + part, num_used - part); } else - err = putused(&used_ring->ring[off], used, num_used); + err = putused(vrh, &used_ring->ring[off], used, num_used); if (err) { vringh_bad("Failed to write %u used entries %u at %p", @@ -564,13 +573,15 @@ static inline int putu16_user(const struct vringh *vrh, __virtio16 *p, u16 val) return put_user(v, (__force __virtio16 __user *)p); } -static inline int copydesc_user(void *dst, const void *src, size_t len) +static inline int copydesc_user(const struct vringh *vrh, + void *dst, const void *src, size_t len) { return copy_from_user(dst, (__force void __user *)src, len) ? -EFAULT : 0; } -static inline int putused_user(struct vring_used_elem *dst, +static inline int putused_user(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned int num) { @@ -578,13 +589,15 @@ static inline int putused_user(struct vring_used_elem *dst, sizeof(*dst) * num) ? -EFAULT : 0; } -static inline int xfer_from_user(void *src, void *dst, size_t len) +static inline int xfer_from_user(const struct vringh *vrh, void *src, + void *dst, size_t len) { return copy_from_user(dst, (__force void __user *)src, len) ? -EFAULT : 0; } -static inline int xfer_to_user(void *dst, void *src, size_t len) +static inline int xfer_to_user(const struct vringh *vrh, + void *dst, void *src, size_t len) { return copy_to_user((__force void __user *)dst, src, len) ? -EFAULT : 0; @@ -706,7 +719,7 @@ EXPORT_SYMBOL(vringh_getdesc_user); */ ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) { - return vringh_iov_xfer((struct vringh_kiov *)riov, + return vringh_iov_xfer(NULL, (struct vringh_kiov *)riov, dst, len, xfer_from_user); } EXPORT_SYMBOL(vringh_iov_pull_user); @@ -722,7 +735,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user); ssize_t vringh_iov_push_user(struct vringh_iov *wiov, const void *src, size_t len) { - return vringh_iov_xfer((struct vringh_kiov *)wiov, + return vringh_iov_xfer(NULL, (struct vringh_kiov *)wiov, (void *)src, len, xfer_to_user); } EXPORT_SYMBOL(vringh_iov_push_user); @@ -832,13 +845,15 @@ static inline int putu16_kern(const struct vringh *vrh, __virtio16 *p, u16 val) return 0; } -static inline int copydesc_kern(void *dst, const void *src, size_t len) +static inline int copydesc_kern(const struct vringh *vrh, + void *dst, const void *src, size_t len) { memcpy(dst, src, len); return 0; } -static inline int putused_kern(struct vring_used_elem *dst, +static inline int putused_kern(const struct vringh *vrh, + struct vring_used_elem *dst, const struct vring_used_elem *src, unsigned int num) { @@ -846,13 +861,15 @@ static inline int putused_kern(struct vring_used_elem *dst, return 0; } -static inline int xfer_kern(void *src, void *dst, size_t len) +static inline int xfer_kern(const struct vringh *vrh, void *src, + void *dst, size_t len) { memcpy(dst, src, len); return 0; } -static inline int kern_xfer(void *dst, void *src, size_t len) +static inline int kern_xfer(const struct vringh *vrh, void *dst, + void *src, size_t len) { memcpy(dst, src, len); return 0; @@ -949,7 +966,7 @@ EXPORT_SYMBOL(vringh_getdesc_kern); */ ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) { - return vringh_iov_xfer(riov, dst, len, xfer_kern); + return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern); } EXPORT_SYMBOL(vringh_iov_pull_kern); @@ -964,7 +981,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern); ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, const void *src, size_t len) { - return vringh_iov_xfer(wiov, (void *)src, len, kern_xfer); + return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer); } EXPORT_SYMBOL(vringh_iov_push_kern); @@ -1042,4 +1059,362 @@ int vringh_need_notify_kern(struct vringh *vrh) } EXPORT_SYMBOL(vringh_need_notify_kern); +static int iotlb_translate(const struct vringh *vrh, + u64 addr, u64 len, struct bio_vec iov[], + int iov_size, u32 perm) +{ + struct vhost_iotlb_map *map; + struct vhost_iotlb *iotlb = vrh->iotlb; + int ret = 0; + u64 s = 0; + + while (len > s) { + u64 size, pa, pfn; + + if (unlikely(ret >= iov_size)) { + ret = -ENOBUFS; + break; + } + + map = vhost_iotlb_itree_first(iotlb, addr, + addr + len - 1); + if (!map || map->start > addr) { + ret = -EINVAL; + break; + } else if (!(map->perm & perm)) { + ret = -EPERM; + break; + } + + size = map->size - addr + map->start; + pa = map->addr + addr - map->start; + pfn = pa >> PAGE_SHIFT; + iov[ret].bv_page = pfn_to_page(pfn); + iov[ret].bv_len = min(len - s, size); + iov[ret].bv_offset = pa & (PAGE_SIZE - 1); + s += size; + addr += size; + ++ret; + } + + return ret; +} + +static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, + void *src, size_t len) +{ + struct iov_iter iter; + struct bio_vec iov[16]; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)src, + len, iov, 16, VHOST_MAP_RO); + if (ret < 0) + return ret; + + iov_iter_bvec(&iter, READ, iov, ret, len); + + ret = copy_from_iter(dst, len, &iter); + + return ret; +} + +static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, + void *src, size_t len) +{ + struct iov_iter iter; + struct bio_vec iov[16]; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, + len, iov, 16, VHOST_MAP_WO); + if (ret < 0) + return ret; + + iov_iter_bvec(&iter, WRITE, iov, ret, len); + + return copy_to_iter(src, len, &iter); +} + +static inline int getu16_iotlb(const struct vringh *vrh, + u16 *val, const __virtio16 *p) +{ + struct bio_vec iov; + void *kaddr, *from; + int ret; + + /* Atomic read is needed for getu16 */ + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + &iov, 1, VHOST_MAP_RO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + from = kaddr + iov.bv_offset; + *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from)); + kunmap_atomic(kaddr); + + return 0; +} + +static inline int putu16_iotlb(const struct vringh *vrh, + __virtio16 *p, u16 val) +{ + struct bio_vec iov; + void *kaddr, *to; + int ret; + + /* Atomic write is needed for putu16 */ + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + &iov, 1, VHOST_MAP_WO); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(iov.bv_page); + to = kaddr + iov.bv_offset; + WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val)); + kunmap_atomic(kaddr); + + return 0; +} + +static inline int copydesc_iotlb(const struct vringh *vrh, + void *dst, const void *src, size_t len) +{ + int ret; + + ret = copy_from_iotlb(vrh, dst, (void *)src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int xfer_from_iotlb(const struct vringh *vrh, void *src, + void *dst, size_t len) +{ + int ret; + + ret = copy_from_iotlb(vrh, dst, src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int xfer_to_iotlb(const struct vringh *vrh, + void *dst, void *src, size_t len) +{ + int ret; + + ret = copy_to_iotlb(vrh, dst, src, len); + if (ret != len) + return -EFAULT; + + return 0; +} + +static inline int putused_iotlb(const struct vringh *vrh, + struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + int size = num * sizeof(*dst); + int ret; + + ret = copy_to_iotlb(vrh, dst, (void *)src, num * sizeof(*dst)); + if (ret != size) + return -EFAULT; + + return 0; +} + +/** + * vringh_init_iotlb - initialize a vringh for a ring with IOTLB. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_iotlb(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + return vringh_init_kern(vrh, features, num, weak_barriers, + desc, avail, used); +} +EXPORT_SYMBOL(vringh_init_iotlb); + +/** + * vringh_set_iotlb - initialize a vringh for a ring with IOTLB. + * @vrh: the vring + * @iotlb: iotlb associated with this vring + */ +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb) +{ + vrh->iotlb = iotlb; +} +EXPORT_SYMBOL(vringh_set_iotlb); + +/** + * vringh_getdesc_iotlb - get next available descriptor from ring with + * IOTLB. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_iotlb(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_iotlb, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_iotlb); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_iotlb); + +/** + * vringh_iov_pull_iotlb - copy bytes from vring_iov. + * @vrh: the vring. + * @riov: the riov as passed to vringh_getdesc_iotlb() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + void *dst, size_t len) +{ + return vringh_iov_xfer(vrh, riov, dst, len, xfer_from_iotlb); +} +EXPORT_SYMBOL(vringh_iov_pull_iotlb); + +/** + * vringh_iov_push_iotlb - copy bytes into vring_iov. + * @vrh: the vring. + * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_iotlb(struct vringh *vrh, + struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(vrh, wiov, (void *)src, len, xfer_to_iotlb); +} +EXPORT_SYMBOL(vringh_iov_push_iotlb); + +/** + * vringh_abandon_iotlb - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_iotlb() to undo). + * + * The next vringh_get_iotlb() will return the old descriptor(s) again. + */ +void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. + */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_iotlb); + +/** + * vringh_complete_iotlb - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_iotlb. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_iotlb() after one or more calls + * to this function. + */ +int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = cpu_to_vringh32(vrh, head); + used.len = cpu_to_vringh32(vrh, len); + + return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb); +} +EXPORT_SYMBOL(vringh_complete_iotlb); + +/** + * vringh_notify_enable_iotlb - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_iotlb(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb); +} +EXPORT_SYMBOL(vringh_notify_enable_iotlb); + +/** + * vringh_notify_disable_iotlb - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_iotlb(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_iotlb); +} +EXPORT_SYMBOL(vringh_notify_disable_iotlb); + +/** + * vringh_need_notify_iotlb - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_iotlb() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_iotlb(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_iotlb); +} +EXPORT_SYMBOL(vringh_need_notify_iotlb); + + MODULE_LICENSE("GPL"); diff --git a/include/linux/vringh.h b/include/linux/vringh.h index d237087eb257..bd0503ca6f8f 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include /* virtio_ring with information needed for host access. */ @@ -39,6 +41,9 @@ struct vringh { /* The vring (note: it may contain user pointers!) */ struct vring vring; + /* IOTLB for this vring */ + struct vhost_iotlb *iotlb; + /* The function to call to notify the guest about added buffers */ void (*notify)(struct vringh *); }; @@ -248,4 +253,35 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val) { return __cpu_to_virtio64(vringh_is_little_endian(vrh), val); } + +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb); + +int vringh_init_iotlb(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used); + +int vringh_getdesc_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp); + +ssize_t vringh_iov_pull_iotlb(struct vringh *vrh, + struct vringh_kiov *riov, + void *dst, size_t len); +ssize_t vringh_iov_push_iotlb(struct vringh *vrh, + struct vringh_kiov *wiov, + const void *src, size_t len); + +void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num); + +int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len); + +bool vringh_notify_enable_iotlb(struct vringh *vrh); +void vringh_notify_disable_iotlb(struct vringh *vrh); + +int vringh_need_notify_iotlb(struct vringh *vrh); + #endif /* _LINUX_VRINGH_H */ -- cgit v1.2.3 From 961e9c84077f6c8579d7a628cbe94a675cb67ae4 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 26 Mar 2020 22:01:21 +0800 Subject: vDPA: introduce vDPA bus vDPA device is a device that uses a datapath which complies with the virtio specifications with vendor specific control path. vDPA devices can be both physically located on the hardware or emulated by software. vDPA hardware devices are usually implemented through PCIE with the following types: - PF (Physical Function) - A single Physical Function - VF (Virtual Function) - Device that supports single root I/O virtualization (SR-IOV). Its Virtual Function (VF) represents a virtualized instance of the device that can be assigned to different partitions - ADI (Assignable Device Interface) and its equivalents - With technologies such as Intel Scalable IOV, a virtual device (VDEV) composed by host OS utilizing one or more ADIs. Or its equivalent like SF (Sub function) from Mellanox. >From a driver's perspective, depends on how and where the DMA translation is done, vDPA devices are split into two types: - Platform specific DMA translation - From the driver's perspective, the device can be used on a platform where device access to data in memory is limited and/or translated. An example is a PCIE vDPA whose DMA request was tagged via a bus (e.g PCIE) specific way. DMA translation and protection are done at PCIE bus IOMMU level. - Device specific DMA translation - The device implements DMA isolation and protection through its own logic. An example is a vDPA device which uses on-chip IOMMU. To hide the differences and complexity of the above types for a vDPA device/IOMMU options and in order to present a generic virtio device to the upper layer, a device agnostic framework is required. This patch introduces a software vDPA bus which abstracts the common attributes of vDPA device, vDPA bus driver and the communication method (vdpa_config_ops) between the vDPA device abstraction and the vDPA bus driver. This allows multiple types of drivers to be used for vDPA device like the virtio_vdpa and vhost_vdpa driver to operate on the bus and allow vDPA device could be used by either kernel virtio driver or userspace vhost drivers as: virtio drivers vhost drivers | | [virtio bus] [vhost uAPI] | | virtio device vhost device virtio_vdpa drv vhost_vdpa drv \ / [vDPA bus] | vDPA device hardware drv | [hardware bus] | vDPA hardware With the abstraction of vDPA bus and vDPA bus operations, the difference and complexity of the under layer hardware is hidden from upper layer. The vDPA bus drivers on top can use a unified vdpa_config_ops to control different types of vDPA device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200326140125.19794-6-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + drivers/virtio/Kconfig | 2 + drivers/virtio/Makefile | 1 + drivers/virtio/vdpa/Kconfig | 7 ++ drivers/virtio/vdpa/Makefile | 2 + drivers/virtio/vdpa/vdpa.c | 180 ++++++++++++++++++++++++++++++ include/linux/vdpa.h | 253 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 446 insertions(+) create mode 100644 drivers/virtio/vdpa/Kconfig create mode 100644 drivers/virtio/vdpa/Makefile create mode 100644 drivers/virtio/vdpa/vdpa.c create mode 100644 include/linux/vdpa.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 19363ed5e723..70c47bc55343 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17699,6 +17699,7 @@ F: tools/virtio/ F: drivers/net/virtio_net.c F: drivers/block/virtio_blk.c F: include/linux/virtio*.h +F: include/linux/vdpa.h F: include/uapi/linux/virtio_*.h F: drivers/crypto/virtio/ F: mm/balloon_compaction.c diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 078615cf2afc..9c4fdb64d9ac 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -96,3 +96,5 @@ config VIRTIO_MMIO_CMDLINE_DEVICES If unsure, say 'N'. endif # VIRTIO_MENU + +source "drivers/virtio/vdpa/Kconfig" diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 3a2b5c5dcf46..fdf5eacd0d0a 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o virtio_pci-$(CONFIG_VIRTIO_PCI_LEGACY) += virtio_pci_legacy.o obj-$(CONFIG_VIRTIO_BALLOON) += virtio_balloon.o obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o +obj-$(CONFIG_VDPA) += vdpa/ diff --git a/drivers/virtio/vdpa/Kconfig b/drivers/virtio/vdpa/Kconfig new file mode 100644 index 000000000000..351617723d12 --- /dev/null +++ b/drivers/virtio/vdpa/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +config VDPA + tristate + help + Enable this module to support vDPA device that uses a + datapath which complies with virtio specifications with + vendor specific control path. diff --git a/drivers/virtio/vdpa/Makefile b/drivers/virtio/vdpa/Makefile new file mode 100644 index 000000000000..ee6a35e8a4fb --- /dev/null +++ b/drivers/virtio/vdpa/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VDPA) += vdpa.o diff --git a/drivers/virtio/vdpa/vdpa.c b/drivers/virtio/vdpa/vdpa.c new file mode 100644 index 000000000000..e9ed6a2b635b --- /dev/null +++ b/drivers/virtio/vdpa/vdpa.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bus. + * + * Copyright (c) 2020, Red Hat. All rights reserved. + * Author: Jason Wang + * + */ + +#include +#include +#include +#include + +static DEFINE_IDA(vdpa_index_ida); + +static int vdpa_dev_probe(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); + int ret = 0; + + if (drv && drv->probe) + ret = drv->probe(vdev); + + return ret; +} + +static int vdpa_dev_remove(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); + + if (drv && drv->remove) + drv->remove(vdev); + + return 0; +} + +static struct bus_type vdpa_bus = { + .name = "vdpa", + .probe = vdpa_dev_probe, + .remove = vdpa_dev_remove, +}; + +static void vdpa_release_dev(struct device *d) +{ + struct vdpa_device *vdev = dev_to_vdpa(d); + const struct vdpa_config_ops *ops = vdev->config; + + if (ops->free) + ops->free(vdev); + + ida_simple_remove(&vdpa_index_ida, vdev->index); + kfree(vdev); +} + +/** + * __vdpa_alloc_device - allocate and initilaize a vDPA device + * This allows driver to some prepartion after device is + * initialized but before registered. + * @parent: the parent device + * @config: the bus operations that is supported by this device + * @size: size of the parent structure that contains private data + * + * Drvier should use vdap_alloc_device() wrapper macro instead of + * using this directly. + * + * Returns an error when parent/config/dma_dev is not set or fail to get + * ida. + */ +struct vdpa_device *__vdpa_alloc_device(struct device *parent, + const struct vdpa_config_ops *config, + size_t size) +{ + struct vdpa_device *vdev; + int err = -EINVAL; + + if (!config) + goto err; + + if (!!config->dma_map != !!config->dma_unmap) + goto err; + + err = -ENOMEM; + vdev = kzalloc(size, GFP_KERNEL); + if (!vdev) + goto err; + + err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL); + if (err < 0) + goto err_ida; + + vdev->dev.bus = &vdpa_bus; + vdev->dev.parent = parent; + vdev->dev.release = vdpa_release_dev; + vdev->index = err; + vdev->config = config; + + err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); + if (err) + goto err_name; + + device_initialize(&vdev->dev); + + return vdev; + +err_name: + ida_simple_remove(&vdpa_index_ida, vdev->index); +err_ida: + kfree(vdev); +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(__vdpa_alloc_device); + +/** + * vdpa_register_device - register a vDPA device + * Callers must have a succeed call of vdpa_init_device() before. + * @vdev: the vdpa device to be registered to vDPA bus + * + * Returns an error when fail to add to vDPA bus + */ +int vdpa_register_device(struct vdpa_device *vdev) +{ + return device_add(&vdev->dev); +} +EXPORT_SYMBOL_GPL(vdpa_register_device); + +/** + * vdpa_unregister_device - unregister a vDPA device + * @vdev: the vdpa device to be unregisted from vDPA bus + */ +void vdpa_unregister_device(struct vdpa_device *vdev) +{ + device_unregister(&vdev->dev); +} +EXPORT_SYMBOL_GPL(vdpa_unregister_device); + +/** + * __vdpa_register_driver - register a vDPA device driver + * @drv: the vdpa device driver to be registered + * @owner: module owner of the driver + * + * Returns an err when fail to do the registration + */ +int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) +{ + drv->driver.bus = &vdpa_bus; + drv->driver.owner = owner; + + return driver_register(&drv->driver); +} +EXPORT_SYMBOL_GPL(__vdpa_register_driver); + +/** + * vdpa_unregister_driver - unregister a vDPA device driver + * @drv: the vdpa device driver to be unregistered + */ +void vdpa_unregister_driver(struct vdpa_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(vdpa_unregister_driver); + +static int vdpa_init(void) +{ + return bus_register(&vdpa_bus); +} + +static void __exit vdpa_exit(void) +{ + bus_unregister(&vdpa_bus); + ida_destroy(&vdpa_index_ida); +} +core_initcall(vdpa_init); +module_exit(vdpa_exit); + +MODULE_AUTHOR("Jason Wang "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h new file mode 100644 index 000000000000..733acfb7ef84 --- /dev/null +++ b/include/linux/vdpa.h @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VDPA_H +#define _LINUX_VDPA_H + +#include +#include +#include +#include + +/** + * vDPA callback definition. + * @callback: interrupt callback function + * @private: the data passed to the callback function + */ +struct vdpa_callback { + irqreturn_t (*callback)(void *data); + void *private; +}; + +/** + * vDPA device - representation of a vDPA device + * @dev: underlying device + * @dma_dev: the actual device that is performing DMA + * @config: the configuration ops for this device. + * @index: device index + */ +struct vdpa_device { + struct device dev; + struct device *dma_dev; + const struct vdpa_config_ops *config; + unsigned int index; +}; + +/** + * vDPA_config_ops - operations for configuring a vDPA device. + * Note: vDPA device drivers are required to implement all of the + * operations unless it is mentioned to be optional in the following + * list. + * + * @set_vq_address: Set the address of virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @desc_area: address of desc area + * @driver_area: address of driver area + * @device_area: address of device area + * Returns integer: success (0) or error (< 0) + * @set_vq_num: Set the size of virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @num: the size of virtqueue + * @kick_vq: Kick the virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @set_vq_cb: Set the interrupt callback function for + * a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @cb: virtio-vdev interrupt callback structure + * @set_vq_ready: Set ready status for a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @ready: ready (true) not ready(false) + * @get_vq_ready: Get ready status for a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * Returns boolean: ready (true) or not (false) + * @set_vq_state: Set the state for a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * @state: virtqueue state (last_avail_idx) + * Returns integer: success (0) or error (< 0) + * @get_vq_state: Get the state for a virtqueue + * @vdev: vdpa device + * @idx: virtqueue index + * Returns virtqueue state (last_avail_idx) + * @get_vq_align: Get the virtqueue align requirement + * for the device + * @vdev: vdpa device + * Returns virtqueue algin requirement + * @get_features: Get virtio features supported by the device + * @vdev: vdpa device + * Returns the virtio features support by the + * device + * @set_features: Set virtio features supported by the driver + * @vdev: vdpa device + * @features: feature support by the driver + * Returns integer: success (0) or error (< 0) + * @set_config_cb: Set the config interrupt callback + * @vdev: vdpa device + * @cb: virtio-vdev interrupt callback structure + * @get_vq_num_max: Get the max size of virtqueue + * @vdev: vdpa device + * Returns u16: max size of virtqueue + * @get_device_id: Get virtio device id + * @vdev: vdpa device + * Returns u32: virtio device id + * @get_vendor_id: Get id for the vendor that provides this device + * @vdev: vdpa device + * Returns u32: virtio vendor id + * @get_status: Get the device status + * @vdev: vdpa device + * Returns u8: virtio device status + * @set_status: Set the device status + * @vdev: vdpa device + * @status: virtio device status + * @get_config: Read from device specific configuration space + * @vdev: vdpa device + * @offset: offset from the beginning of + * configuration space + * @buf: buffer used to read to + * @len: the length to read from + * configuration space + * @set_config: Write to device specific configuration space + * @vdev: vdpa device + * @offset: offset from the beginning of + * configuration space + * @buf: buffer used to write from + * @len: the length to write to + * configuration space + * @get_generation: Get device config generation (optional) + * @vdev: vdpa device + * Returns u32: device generation + * @set_map: Set device memory mapping (optional) + * Needed for device that using device + * specific DMA translation (on-chip IOMMU) + * @vdev: vdpa device + * @iotlb: vhost memory mapping to be + * used by the vDPA + * Returns integer: success (0) or error (< 0) + * @dma_map: Map an area of PA to IOVA (optional) + * Needed for device that using device + * specific DMA translation (on-chip IOMMU) + * and preferring incremental map. + * @vdev: vdpa device + * @iova: iova to be mapped + * @size: size of the area + * @pa: physical address for the map + * @perm: device access permission (VHOST_MAP_XX) + * Returns integer: success (0) or error (< 0) + * @dma_unmap: Unmap an area of IOVA (optional but + * must be implemented with dma_map) + * Needed for device that using device + * specific DMA translation (on-chip IOMMU) + * and preferring incremental unmap. + * @vdev: vdpa device + * @iova: iova to be unmapped + * @size: size of the area + * Returns integer: success (0) or error (< 0) + * @free: Free resources that belongs to vDPA (optional) + * @vdev: vdpa device + */ +struct vdpa_config_ops { + /* Virtqueue ops */ + int (*set_vq_address)(struct vdpa_device *vdev, + u16 idx, u64 desc_area, u64 driver_area, + u64 device_area); + void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num); + void (*kick_vq)(struct vdpa_device *vdev, u16 idx); + void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx, + struct vdpa_callback *cb); + void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready); + bool (*get_vq_ready)(struct vdpa_device *vdev, u16 idx); + int (*set_vq_state)(struct vdpa_device *vdev, u16 idx, u64 state); + u64 (*get_vq_state)(struct vdpa_device *vdev, u16 idx); + + /* Device ops */ + u16 (*get_vq_align)(struct vdpa_device *vdev); + u64 (*get_features)(struct vdpa_device *vdev); + int (*set_features)(struct vdpa_device *vdev, u64 features); + void (*set_config_cb)(struct vdpa_device *vdev, + struct vdpa_callback *cb); + u16 (*get_vq_num_max)(struct vdpa_device *vdev); + u32 (*get_device_id)(struct vdpa_device *vdev); + u32 (*get_vendor_id)(struct vdpa_device *vdev); + u8 (*get_status)(struct vdpa_device *vdev); + void (*set_status)(struct vdpa_device *vdev, u8 status); + void (*get_config)(struct vdpa_device *vdev, unsigned int offset, + void *buf, unsigned int len); + void (*set_config)(struct vdpa_device *vdev, unsigned int offset, + const void *buf, unsigned int len); + u32 (*get_generation)(struct vdpa_device *vdev); + + /* DMA ops */ + int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); + int (*dma_map)(struct vdpa_device *vdev, u64 iova, u64 size, + u64 pa, u32 perm); + int (*dma_unmap)(struct vdpa_device *vdev, u64 iova, u64 size); + + /* Free device resources */ + void (*free)(struct vdpa_device *vdev); +}; + +struct vdpa_device *__vdpa_alloc_device(struct device *parent, + const struct vdpa_config_ops *config, + size_t size); + +#define vdpa_alloc_device(dev_struct, member, parent, config) \ + container_of(__vdpa_alloc_device( \ + parent, config, \ + sizeof(dev_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + dev_struct, member))), \ + dev_struct, member) + +int vdpa_register_device(struct vdpa_device *vdev); +void vdpa_unregister_device(struct vdpa_device *vdev); + +/** + * vdpa_driver - operations for a vDPA driver + * @driver: underlying device driver + * @probe: the function to call when a device is found. Returns 0 or -errno. + * @remove: the function to call when a device is removed. + */ +struct vdpa_driver { + struct device_driver driver; + int (*probe)(struct vdpa_device *vdev); + void (*remove)(struct vdpa_device *vdev); +}; + +#define vdpa_register_driver(drv) \ + __vdpa_register_driver(drv, THIS_MODULE) +int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner); +void vdpa_unregister_driver(struct vdpa_driver *drv); + +#define module_vdpa_driver(__vdpa_driver) \ + module_driver(__vdpa_driver, vdpa_register_driver, \ + vdpa_unregister_driver) + +static inline struct vdpa_driver *drv_to_vdpa(struct device_driver *driver) +{ + return container_of(driver, struct vdpa_driver, driver); +} + +static inline struct vdpa_device *dev_to_vdpa(struct device *_dev) +{ + return container_of(_dev, struct vdpa_device, dev); +} + +static inline void *vdpa_get_drvdata(const struct vdpa_device *vdev) +{ + return dev_get_drvdata(&vdev->dev); +} + +static inline void vdpa_set_drvdata(struct vdpa_device *vdev, void *data) +{ + dev_set_drvdata(&vdev->dev, data); +} + +static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) +{ + return vdev->dma_dev; +} +#endif /* _LINUX_VDPA_H */ -- cgit v1.2.3 From 4c8cf31885f69e86be0b5b9e6677a26797365e1d Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 26 Mar 2020 22:01:23 +0800 Subject: vhost: introduce vDPA-based backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a vDPA-based vhost backend. This backend is built on top of the same interface defined in virtio-vDPA and provides a generic vhost interface for userspace to accelerate the virtio devices in guest. This backend is implemented as a vDPA device driver on top of the same ops used in virtio-vDPA. It will create char device entry named vhost-vdpa-$index for userspace to use. Userspace can use vhost ioctls on top of this char device to setup the backend. Vhost ioctls are extended to make it type agnostic and behave like a virtio device, this help to eliminate type specific API like what vhost_net/scsi/vsock did: - VHOST_VDPA_GET_DEVICE_ID: get the virtio device ID which is defined by virtio specification to differ from different type of devices - VHOST_VDPA_GET_VRING_NUM: get the maximum size of virtqueue supported by the vDPA device - VHSOT_VDPA_SET/GET_STATUS: set and get virtio status of vDPA device - VHOST_VDPA_SET/GET_CONFIG: access virtio config space - VHOST_VDPA_SET_VRING_ENABLE: enable a specific virtqueue For memory mapping, IOTLB API is mandated for vhost-vDPA which means userspace drivers are required to use VHOST_IOTLB_UPDATE/VHOST_IOTLB_INVALIDATE to add or remove mapping for a specific userspace memory region. The vhost-vDPA API is designed to be type agnostic, but it allows net device only in current stage. Due to the lacking of control virtqueue support, some features were filter out by vhost-vdpa. We will enable more features and devices in the near future. Signed-off-by: Tiwei Bie Signed-off-by: Eugenio PĂ©rez Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20200326140125.19794-8-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/Kconfig | 12 + drivers/vhost/Makefile | 3 + drivers/vhost/vdpa.c | 883 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 24 ++ include/uapi/linux/vhost_types.h | 8 + 5 files changed, 930 insertions(+) create mode 100644 drivers/vhost/vdpa.c (limited to 'include') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 128238488078..362b832f5338 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -59,6 +59,18 @@ config VHOST_VSOCK To compile this driver as a module, choose M here: the module will be called vhost_vsock. +config VHOST_VDPA + tristate "Vhost driver for vDPA-based backend" + depends on EVENTFD + select VHOST + select VDPA + help + This kernel module can be loaded in host kernel to accelerate + guest virtio devices with the vDPA-based backends. + + To compile this driver as a module, choose M here: the module + will be called vhost_vdpa. + config VHOST_CROSS_ENDIAN_LEGACY bool "Cross-endian support for vhost" default n diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index fb831002bcf0..f3e1897cce85 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -10,6 +10,9 @@ vhost_vsock-y := vsock.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o +vhost_vdpa-y := vdpa.o + obj-$(CONFIG_VHOST) += vhost.o obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c new file mode 100644 index 000000000000..421f02a8530a --- /dev/null +++ b/drivers/vhost/vdpa.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018-2020 Intel Corporation. + * Copyright (C) 2020 Red Hat, Inc. + * + * Author: Tiwei Bie + * Jason Wang + * + * Thanks Michael S. Tsirkin for the valuable comments and + * suggestions. And thanks to Cunming Liang and Zhihong Wang for all + * their supports. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +enum { + VHOST_VDPA_FEATURES = + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_F_ANY_LAYOUT) | + (1ULL << VIRTIO_F_VERSION_1) | + (1ULL << VIRTIO_F_IOMMU_PLATFORM) | + (1ULL << VIRTIO_F_RING_PACKED) | + (1ULL << VIRTIO_F_ORDER_PLATFORM) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX), + + VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES | + (1ULL << VIRTIO_NET_F_CSUM) | + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | + (1ULL << VIRTIO_NET_F_MTU) | + (1ULL << VIRTIO_NET_F_MAC) | + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | + (1ULL << VIRTIO_NET_F_GUEST_ECN) | + (1ULL << VIRTIO_NET_F_GUEST_UFO) | + (1ULL << VIRTIO_NET_F_HOST_TSO4) | + (1ULL << VIRTIO_NET_F_HOST_TSO6) | + (1ULL << VIRTIO_NET_F_HOST_ECN) | + (1ULL << VIRTIO_NET_F_HOST_UFO) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_STATUS) | + (1ULL << VIRTIO_NET_F_SPEED_DUPLEX), +}; + +/* Currently, only network backend w/o multiqueue is supported. */ +#define VHOST_VDPA_VQ_MAX 2 + +#define VHOST_VDPA_DEV_MAX (1U << MINORBITS) + +struct vhost_vdpa { + struct vhost_dev vdev; + struct iommu_domain *domain; + struct vhost_virtqueue *vqs; + struct completion completion; + struct vdpa_device *vdpa; + struct device dev; + struct cdev cdev; + atomic_t opened; + int nvqs; + int virtio_id; + int minor; +}; + +static DEFINE_IDA(vhost_vdpa_ida); + +static dev_t vhost_vdpa_major; + +static const u64 vhost_vdpa_features[] = { + [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES, +}; + +static void handle_vq_kick(struct vhost_work *work) +{ + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev); + const struct vdpa_config_ops *ops = v->vdpa->config; + + ops->kick_vq(v->vdpa, vq - v->vqs); +} + +static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) +{ + struct vhost_virtqueue *vq = private; + struct eventfd_ctx *call_ctx = vq->call_ctx; + + if (call_ctx) + eventfd_signal(call_ctx, 1); + + return IRQ_HANDLED; +} + +static void vhost_vdpa_reset(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + ops->set_status(vdpa, 0); +} + +static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u32 device_id; + + device_id = ops->get_device_id(vdpa); + + if (copy_to_user(argp, &device_id, sizeof(device_id))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + status = ops->get_status(vdpa); + + if (copy_to_user(statusp, &status, sizeof(status))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u8 status; + + if (copy_from_user(&status, statusp, sizeof(status))) + return -EFAULT; + + /* + * Userspace shouldn't remove status bits unless reset the + * status to 0. + */ + if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) + return -EINVAL; + + ops->set_status(vdpa, status); + + return 0; +} + +static int vhost_vdpa_config_validate(struct vhost_vdpa *v, + struct vhost_vdpa_config *c) +{ + long size = 0; + + switch (v->virtio_id) { + case VIRTIO_ID_NET: + size = sizeof(struct virtio_net_config); + break; + } + + if (c->len == 0) + return -EINVAL; + + if (c->len > size - c->off) + return -E2BIG; + + return 0; +} + +static long vhost_vdpa_get_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ops->get_config(vdpa, config.off, buf, config.len); + + if (copy_to_user(c->buf, buf, config.len)) { + kvfree(buf); + return -EFAULT; + } + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_set_config(struct vhost_vdpa *v, + struct vhost_vdpa_config __user *c) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa_config config; + unsigned long size = offsetof(struct vhost_vdpa_config, buf); + u8 *buf; + + if (copy_from_user(&config, c, size)) + return -EFAULT; + if (vhost_vdpa_config_validate(v, &config)) + return -EINVAL; + buf = kvzalloc(config.len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, c->buf, config.len)) { + kvfree(buf); + return -EFAULT; + } + + ops->set_config(vdpa, config.off, buf, config.len); + + kvfree(buf); + return 0; +} + +static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + features = ops->get_features(vdpa); + features &= vhost_vdpa_features[v->virtio_id]; + + if (copy_to_user(featurep, &features, sizeof(features))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u64 features; + + /* + * It's not allowed to change the features after they have + * been negotiated. + */ + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK) + return -EBUSY; + + if (copy_from_user(&features, featurep, sizeof(features))) + return -EFAULT; + + if (features & ~vhost_vdpa_features[v->virtio_id]) + return -EINVAL; + + if (ops->set_features(vdpa, features)) + return -EINVAL; + + return 0; +} + +static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + u16 num; + + num = ops->get_vq_num_max(vdpa); + + if (copy_to_user(argp, &num, sizeof(num))) + return -EFAULT; + + return 0; +} + +static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, + void __user *argp) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct vdpa_callback cb; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + u8 status; + u32 idx; + long r; + + r = get_user(idx, (u32 __user *)argp); + if (r < 0) + return r; + + if (idx >= v->nvqs) + return -ENOBUFS; + + idx = array_index_nospec(idx, v->nvqs); + vq = &v->vqs[idx]; + + status = ops->get_status(vdpa); + + if (cmd == VHOST_VDPA_SET_VRING_ENABLE) { + if (copy_from_user(&s, argp, sizeof(s))) + return -EFAULT; + ops->set_vq_ready(vdpa, idx, s.num); + return 0; + } + + if (cmd == VHOST_GET_VRING_BASE) + vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx); + + r = vhost_vring_ioctl(&v->vdev, cmd, argp); + if (r) + return r; + + switch (cmd) { + case VHOST_SET_VRING_ADDR: + if (ops->set_vq_address(vdpa, idx, + (u64)(uintptr_t)vq->desc, + (u64)(uintptr_t)vq->avail, + (u64)(uintptr_t)vq->used)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_BASE: + if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx)) + r = -EINVAL; + break; + + case VHOST_SET_VRING_CALL: + if (vq->call_ctx) { + cb.callback = vhost_vdpa_virtqueue_cb; + cb.private = vq; + } else { + cb.callback = NULL; + cb.private = NULL; + } + ops->set_vq_cb(vdpa, idx, &cb); + break; + + case VHOST_SET_VRING_NUM: + ops->set_vq_num(vdpa, idx, vq->num); + break; + } + + return r; +} + +static long vhost_vdpa_unlocked_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + void __user *argp = (void __user *)arg; + long r; + + mutex_lock(&d->mutex); + + switch (cmd) { + case VHOST_VDPA_GET_DEVICE_ID: + r = vhost_vdpa_get_device_id(v, argp); + break; + case VHOST_VDPA_GET_STATUS: + r = vhost_vdpa_get_status(v, argp); + break; + case VHOST_VDPA_SET_STATUS: + r = vhost_vdpa_set_status(v, argp); + break; + case VHOST_VDPA_GET_CONFIG: + r = vhost_vdpa_get_config(v, argp); + break; + case VHOST_VDPA_SET_CONFIG: + r = vhost_vdpa_set_config(v, argp); + break; + case VHOST_GET_FEATURES: + r = vhost_vdpa_get_features(v, argp); + break; + case VHOST_SET_FEATURES: + r = vhost_vdpa_set_features(v, argp); + break; + case VHOST_VDPA_GET_VRING_NUM: + r = vhost_vdpa_get_vring_num(v, argp); + break; + case VHOST_SET_LOG_BASE: + case VHOST_SET_LOG_FD: + r = -ENOIOCTLCMD; + break; + default: + r = vhost_dev_ioctl(&v->vdev, cmd, argp); + if (r == -ENOIOCTLCMD) + r = vhost_vdpa_vring_ioctl(v, cmd, argp); + break; + } + + mutex_unlock(&d->mutex); + return r; +} + +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb_map *map; + struct page *page; + unsigned long pfn, pinned; + + while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) { + pinned = map->size >> PAGE_SHIFT; + for (pfn = map->addr >> PAGE_SHIFT; + pinned > 0; pfn++, pinned--) { + page = pfn_to_page(pfn); + if (map->perm & VHOST_ACCESS_WO) + set_page_dirty_lock(page); + unpin_user_page(page); + } + atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm); + vhost_iotlb_map_free(iotlb, map); + } +} + +static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) +{ + struct vhost_dev *dev = &v->vdev; + + vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); + kfree(dev->iotlb); + dev->iotlb = NULL; +} + +static int perm_to_iommu_flags(u32 perm) +{ + int flags = 0; + + switch (perm) { + case VHOST_ACCESS_WO: + flags |= IOMMU_WRITE; + break; + case VHOST_ACCESS_RO: + flags |= IOMMU_READ; + break; + case VHOST_ACCESS_RW: + flags |= (IOMMU_WRITE | IOMMU_READ); + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags | IOMMU_CACHE; +} + +static int vhost_vdpa_map(struct vhost_vdpa *v, + u64 iova, u64 size, u64 pa, u32 perm) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + int r = 0; + + r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, + pa, perm); + if (r) + return r; + + if (ops->dma_map) + r = ops->dma_map(vdpa, iova, size, pa, perm); + else if (ops->set_map) + r = ops->set_map(vdpa, dev->iotlb); + else + r = iommu_map(v->domain, iova, pa, size, + perm_to_iommu_flags(perm)); + + return r; +} + +static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) +{ + struct vhost_dev *dev = &v->vdev; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); + + if (ops->dma_map) + ops->dma_unmap(vdpa, iova, size); + else if (ops->set_map) + ops->set_map(vdpa, dev->iotlb); + else + iommu_unmap(v->domain, iova, size); +} + +static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, + struct vhost_iotlb_msg *msg) +{ + struct vhost_dev *dev = &v->vdev; + struct vhost_iotlb *iotlb = dev->iotlb; + struct page **page_list; + unsigned long list_size = PAGE_SIZE / sizeof(struct page *); + unsigned int gup_flags = FOLL_LONGTERM; + unsigned long npages, cur_base, map_pfn, last_pfn = 0; + unsigned long locked, lock_limit, pinned, i; + u64 iova = msg->iova; + int ret = 0; + + if (vhost_iotlb_itree_first(iotlb, msg->iova, + msg->iova + msg->size - 1)) + return -EEXIST; + + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + + if (msg->perm & VHOST_ACCESS_WO) + gup_flags |= FOLL_WRITE; + + npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; + if (!npages) + return -EINVAL; + + down_read(&dev->mm->mmap_sem); + + locked = atomic64_add_return(npages, &dev->mm->pinned_vm); + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (locked > lock_limit) { + ret = -ENOMEM; + goto out; + } + + cur_base = msg->uaddr & PAGE_MASK; + iova &= PAGE_MASK; + + while (npages) { + pinned = min_t(unsigned long, npages, list_size); + ret = pin_user_pages(cur_base, pinned, + gup_flags, page_list, NULL); + if (ret != pinned) + goto out; + + if (!last_pfn) + map_pfn = page_to_pfn(page_list[0]); + + for (i = 0; i < ret; i++) { + unsigned long this_pfn = page_to_pfn(page_list[i]); + u64 csize; + + if (last_pfn && (this_pfn != last_pfn + 1)) { + /* Pin a contiguous chunk of memory */ + csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; + if (vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm)) + goto out; + map_pfn = this_pfn; + iova += csize; + } + + last_pfn = this_pfn; + } + + cur_base += ret << PAGE_SHIFT; + npages -= ret; + } + + /* Pin the rest chunk */ + ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT, + map_pfn << PAGE_SHIFT, msg->perm); +out: + if (ret) { + vhost_vdpa_unmap(v, msg->iova, msg->size); + atomic64_sub(npages, &dev->mm->pinned_vm); + } + up_read(&dev->mm->mmap_sem); + free_page((unsigned long)page_list); + return ret; +} + +static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *msg) +{ + struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); + int r = 0; + + r = vhost_dev_check_owner(dev); + if (r) + return r; + + switch (msg->type) { + case VHOST_IOTLB_UPDATE: + r = vhost_vdpa_process_iotlb_update(v, msg); + break; + case VHOST_IOTLB_INVALIDATE: + vhost_vdpa_unmap(v, msg->iova, msg->size); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_vdpa *v = file->private_data; + struct vhost_dev *dev = &v->vdev; + + return vhost_chr_write_iter(dev, from); +} + +static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + struct bus_type *bus; + int ret; + + /* Device want to do DMA by itself */ + if (ops->set_map || ops->dma_map) + return 0; + + bus = dma_dev->bus; + if (!bus) + return -EFAULT; + + if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) + return -ENOTSUPP; + + v->domain = iommu_domain_alloc(bus); + if (!v->domain) + return -EIO; + + ret = iommu_attach_device(v->domain, dma_dev); + if (ret) + goto err_attach; + + return 0; + +err_attach: + iommu_domain_free(v->domain); + return ret; +} + +static void vhost_vdpa_free_domain(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + struct device *dma_dev = vdpa_get_dma_dev(vdpa); + + if (v->domain) { + iommu_detach_device(v->domain, dma_dev); + iommu_domain_free(v->domain); + } + + v->domain = NULL; +} + +static int vhost_vdpa_open(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v; + struct vhost_dev *dev; + struct vhost_virtqueue **vqs; + int nvqs, i, r, opened; + + v = container_of(inode->i_cdev, struct vhost_vdpa, cdev); + if (!v) + return -ENODEV; + + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (opened) + return -EBUSY; + + nvqs = v->nvqs; + vhost_vdpa_reset(v); + + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); + if (!vqs) { + r = -ENOMEM; + goto err; + } + + dev = &v->vdev; + for (i = 0; i < nvqs; i++) { + vqs[i] = &v->vqs[i]; + vqs[i]->handle_kick = handle_vq_kick; + } + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, + vhost_vdpa_process_iotlb_msg); + + dev->iotlb = vhost_iotlb_alloc(0, 0); + if (!dev->iotlb) { + r = -ENOMEM; + goto err_init_iotlb; + } + + r = vhost_vdpa_alloc_domain(v); + if (r) + goto err_init_iotlb; + + filep->private_data = v; + + return 0; + +err_init_iotlb: + vhost_dev_cleanup(&v->vdev); +err: + atomic_dec(&v->opened); + return r; +} + +static int vhost_vdpa_release(struct inode *inode, struct file *filep) +{ + struct vhost_vdpa *v = filep->private_data; + struct vhost_dev *d = &v->vdev; + + mutex_lock(&d->mutex); + filep->private_data = NULL; + vhost_vdpa_reset(v); + vhost_dev_stop(&v->vdev); + vhost_vdpa_iotlb_free(v); + vhost_vdpa_free_domain(v); + vhost_dev_cleanup(&v->vdev); + kfree(v->vdev.vqs); + mutex_unlock(&d->mutex); + + atomic_dec(&v->opened); + complete(&v->completion); + + return 0; +} + +static const struct file_operations vhost_vdpa_fops = { + .owner = THIS_MODULE, + .open = vhost_vdpa_open, + .release = vhost_vdpa_release, + .write_iter = vhost_vdpa_chr_write_iter, + .unlocked_ioctl = vhost_vdpa_unlocked_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static void vhost_vdpa_release_dev(struct device *device) +{ + struct vhost_vdpa *v = + container_of(device, struct vhost_vdpa, dev); + + ida_simple_remove(&vhost_vdpa_ida, v->minor); + kfree(v->vqs); + kfree(v); +} + +static int vhost_vdpa_probe(struct vdpa_device *vdpa) +{ + const struct vdpa_config_ops *ops = vdpa->config; + struct vhost_vdpa *v; + int minor, nvqs = VHOST_VDPA_VQ_MAX; + int r; + + /* Currently, we only accept the network devices. */ + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) + return -ENOTSUPP; + + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!v) + return -ENOMEM; + + minor = ida_simple_get(&vhost_vdpa_ida, 0, + VHOST_VDPA_DEV_MAX, GFP_KERNEL); + if (minor < 0) { + kfree(v); + return minor; + } + + atomic_set(&v->opened, 0); + v->minor = minor; + v->vdpa = vdpa; + v->nvqs = nvqs; + v->virtio_id = ops->get_device_id(vdpa); + + device_initialize(&v->dev); + v->dev.release = vhost_vdpa_release_dev; + v->dev.parent = &vdpa->dev; + v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor); + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue), + GFP_KERNEL); + if (!v->vqs) { + r = -ENOMEM; + goto err; + } + + r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor); + if (r) + goto err; + + cdev_init(&v->cdev, &vhost_vdpa_fops); + v->cdev.owner = THIS_MODULE; + + r = cdev_device_add(&v->cdev, &v->dev); + if (r) + goto err; + + init_completion(&v->completion); + vdpa_set_drvdata(vdpa, v); + + return 0; + +err: + put_device(&v->dev); + return r; +} + +static void vhost_vdpa_remove(struct vdpa_device *vdpa) +{ + struct vhost_vdpa *v = vdpa_get_drvdata(vdpa); + int opened; + + cdev_device_del(&v->cdev, &v->dev); + + do { + opened = atomic_cmpxchg(&v->opened, 0, 1); + if (!opened) + break; + wait_for_completion(&v->completion); + } while (1); + + put_device(&v->dev); +} + +static struct vdpa_driver vhost_vdpa_driver = { + .driver = { + .name = "vhost_vdpa", + }, + .probe = vhost_vdpa_probe, + .remove = vhost_vdpa_remove, +}; + +static int __init vhost_vdpa_init(void) +{ + int r; + + r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX, + "vhost-vdpa"); + if (r) + goto err_alloc_chrdev; + + r = vdpa_register_driver(&vhost_vdpa_driver); + if (r) + goto err_vdpa_register_driver; + + return 0; + +err_vdpa_register_driver: + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +err_alloc_chrdev: + return r; +} +module_init(vhost_vdpa_init); + +static void __exit vhost_vdpa_exit(void) +{ + vdpa_unregister_driver(&vhost_vdpa_driver); + unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); +} +module_exit(vhost_vdpa_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("vDPA-based vhost backend for virtio"); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 40d028eed645..9fe72e4b1373 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -116,4 +116,28 @@ #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64) #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int) +/* VHOST_VDPA specific defines */ + +/* Get the device id. The device ids follow the same definition of + * the device id defined in virtio-spec. + */ +#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32) +/* Get and set the status. The status bits follow the same definition + * of the device status defined in virtio-spec. + */ +#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8) +#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8) +/* Get and set the device config. The device config follows the same + * definition of the device config defined in virtio-spec. + */ +#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \ + struct vhost_vdpa_config) +#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \ + struct vhost_vdpa_config) +/* Enable/disable the ring. */ +#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \ + struct vhost_vring_state) +/* Get the max ring size. */ +#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16) + #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index c907290ff065..669457ce5c48 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -119,6 +119,14 @@ struct vhost_scsi_target { unsigned short reserved; }; +/* VHOST_VDPA specific definitions */ + +struct vhost_vdpa_config { + __u32 off; + __u32 len; + __u8 buf[0]; +}; + /* Feature bits */ /* Log all write descriptors. Can be changed while device is active. */ #define VHOST_F_LOG_ALL 26 -- cgit v1.2.3