/* Intel PRO/1000 Linux driver * Copyright (c) 2017, Linaro Limited * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * The full GNU General Public License is included in this distribution in * the file called "COPYING". */ #include #include #include #include #include #include #include "e1000.h" #include "vfio-net.h" static struct net_device *netmdev_get_netdev(struct mdev_device *mdev); /* Common vfio-net code: move to core */ #if 1 struct iovamap { u64 iova; void *vaddr; struct device *dev; u32 size; /* maximum of 32MB */ enum dma_data_direction direction; /* DMA_FROM_DEVICE... */ }; typedef struct netmdev { union { char page0[4096]; struct { struct net_device *netdev; /* FIXME USE A LINKED LIST */ int mappings_count; struct iovamap mappings[128]; /* 3.5KB */ }; }; union { /* shadow features & statistics page */ char page1[4096]; struct { netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; netdev_features_t gso_partial_features; struct net_device_stats stats; atomic_long_t rx_dropped; atomic_long_t tx_dropped; atomic_long_t rx_nohandler; }; }; } netmdev; #define VFIO_NET_OFFSET_SHIFT 40 #define VFIO_NET_OFFSET_TO_INDEX(off) (off >> VFIO_NET_OFFSET_SHIFT) #define VFIO_NET_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_NET_OFFSET_SHIFT) #define VFIO_NET_OFFSET_MASK (((u64)(1) << VFIO_NET_OFFSET_SHIFT) - 1) #endif /** * e1000e_vfio_net_open - decouple from network stack * @mdev: mediated device to open * * Decouple network device from the stack and make it available to userspace * packet reception and transmission. */ static int e1000e_vfio_net_open(struct mdev_device *mdev) { struct net_device *netdev = netmdev_get_netdev(mdev); struct e1000_adapter *adapter = netdev_priv(netdev); dev_hold(adapter->netdev); set_bit(__E1000_VFIO_NET, &adapter->state); if (netif_running(adapter->netdev)) e1000e_reinit_locked(adapter); else e1000e_reset(adapter); return 0; } /** * e1000e_vfio_net_release - cleanup attach to network stack * @mdev: mediated device to release * * Cleanup all mediated device resources and restore network stack * connection. */ static void e1000e_vfio_net_release(struct mdev_device *mdev) { struct net_device *netdev = netmdev_get_netdev(mdev); struct e1000_adapter *adapter = netdev_priv(netdev); clear_bit(__E1000_VFIO_NET, &adapter->state); if (netif_running(adapter->netdev)) e1000e_reinit_locked(adapter); else e1000e_reset(adapter); dev_put(adapter->netdev); } /** * e1000e_vfio_net_device_info_get - query vfio device information * @adapter: board private structure * @info: vfio device description * * Retrieves information about the device. Fills in provided * struct vfio_device_info. */ static void e1000e_vfio_net_device_info_get(struct e1000_adapter *adapter, struct vfio_device_info *info) { info->flags = VFIO_DEVICE_FLAGS_PCI | VFIO_DEVICE_FLAGS_RESET; info->num_regions = VFIO_PCI_NUM_REGIONS + 3; info->num_irqs = 1; } /** * e1000e_vfio_net_region_info_get - query vfio region information * @adapter: board private structure * @info: vfio region description * * Retrieves information about the region. The region to query is specified in * @info->index. Fills in provided struct vfio_region_info. */ static int e1000e_vfio_net_region_info_get(struct e1000_adapter *adapter, struct vfio_region_info *info) { int ret = 0; switch (info->index) { case VFIO_PCI_BAR0_REGION_INDEX: /* PCI resource 0 */ info->offset = VFIO_NET_INDEX_TO_OFFSET(info->index); info->size = pci_resource_len(adapter->pdev, info->index); info->flags = VFIO_REGION_INFO_FLAG_MMAP; break; case VFIO_PCI_NUM_REGIONS + 1: /* RX descriptor rings */ info->offset = VFIO_NET_INDEX_TO_OFFSET(info->index); info->size = adapter->rx_ring[0].size; info->flags = VFIO_REGION_INFO_FLAG_MMAP; break; case VFIO_PCI_NUM_REGIONS + 2: /* TX descriptor ring */ info->offset = VFIO_NET_INDEX_TO_OFFSET(info->index); info->size = adapter->tx_ring[0].size; info->flags = VFIO_REGION_INFO_FLAG_MMAP; break; default: ret = -EINVAL; break; } return ret; } /** * e1000e_vfio_net_irq_info_get - query vfio irq information * @adapter: board private structure * @info: vfio irq description * * Retrieves information about the irq. Fills in provided * struct vfio_irq_info. */ static void e1000e_vfio_net_irq_info_get(struct e1000_adapter *adapter, struct vfio_irq_info *info) { info->count = 1; info->flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED; } /** * e1000e_vfio_net_ioctl - vfio-net ioctl * @mdev: * @cmd: * @arg: * * Implement device and region queries, DMA memory mapping and device reset **/ static long e1000e_vfio_net_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) { struct net_device *netdev = netmdev_get_netdev(mdev); struct e1000_adapter *adapter = netdev_priv(netdev); struct vfio_device_info device_info; struct vfio_region_info region_info; struct vfio_irq_info irq_info; unsigned long minsz; int ret; switch (cmd) { case VFIO_DEVICE_GET_INFO: minsz = offsetofend(struct vfio_device_info, num_irqs); if (copy_from_user(&device_info, (void __user *)arg, minsz)) return -EFAULT; if (device_info.argsz < minsz) return -EINVAL; e1000e_vfio_net_device_info_get(adapter, &device_info); if (copy_to_user((void __user *)arg, &device_info, minsz)) return -EFAULT; return 0; case VFIO_DEVICE_GET_REGION_INFO: minsz = offsetofend(struct vfio_region_info, offset); if (copy_from_user(®ion_info, (void __user *)arg, minsz)) return -EFAULT; if (region_info.argsz < minsz) return -EINVAL; ret = e1000e_vfio_net_region_info_get(adapter, ®ion_info); if (ret < 0) return ret; if (copy_to_user((void __user *)arg, ®ion_info, minsz)) return -EFAULT; return 0; case VFIO_DEVICE_GET_IRQ_INFO: minsz = offsetofend(struct vfio_irq_info, count); if (copy_from_user(&irq_info, (void __user *)arg, minsz)) return -EFAULT; if (irq_info.argsz < minsz || irq_info.index != 0) return -EINVAL; e1000e_vfio_net_irq_info_get(adapter, &irq_info); return copy_to_user((void __user *)arg, &irq_info, minsz) ? -EFAULT : 0; case VFIO_IOMMU_MAP_DMA: /* Common vfio-net code: move to core */ #if 1 { struct netmdev *netmdev = mdev_get_drvdata(mdev); struct vfio_iommu_type1_dma_map map; struct vm_area_struct *vma; void *data; struct device *parent_dev; int node; dma_addr_t mapping; int ret = -EINVAL; /* allocate DMA area and map it where the userland asks * userland need to mmap an area WITHOUT allocating pages: * mmap(vaddr,size, PROT_READ | PROT_WRITE, MAP_SHARED | * MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, -1, 0 * MAP_NORESERVE ensures only VA space is booked, no pages are * mapped * the mapping must be the entire area, not partial on * the vma */ if (netmdev->mappings_count >= 128) return -EFAULT; minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); if (copy_from_user(&map, (void __user *)arg, minsz)) { ret = -EFAULT; goto out; } if (map.argsz < minsz) goto out; printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: find_vma(%llx)\n", map.vaddr); /* * locates the containing vma for the required map.vaddr * the vma must point to the entire zone allocated by mmap in * userland */ vma = find_vma(current->mm, map.vaddr); if (!vma) return -EFAULT; if (map.vaddr >= vma->vm_end) return -EFAULT; printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: found vma(%llx) -> start=%lx end=%lx pg_off=%lx\n", map.vaddr, vma->vm_start, vma->vm_end, vma->vm_pgoff); /* the iova will be returned as part of the ioctl to the userland */ //parent_dev = &tp->pci_dev->dev; parent_dev = mdev_parent_dev(mdev); node = netdev->dev.parent ? dev_to_node(netdev->dev.parent) : -1; data = kmalloc_node(map.size, GFP_KERNEL, node); if (!data) /* return ret? */ return -ENOMEM; printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: about to dma_map_single(%p, %p, %lld, DMA_FROM_DEVICE)\n", parent_dev, data, map.size); mapping = dma_map_single(parent_dev, data, map.size, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(parent_dev, mapping))) { if (net_ratelimit()) printk(KERN_ERR"Failed to dma_map_single buffer for userland!\n"); kfree(data); goto out; } map.iova = mapping; ret = io_remap_pfn_range(vma, map.vaddr, virt_to_phys(data) >> PAGE_SHIFT, map.size, vma->vm_page_prot); printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range %llx -> physmem <- @%llx, %lld:%d\n", map.vaddr, map.iova, map.size, ret); if (ret != 0) { dma_unmap_single(parent_dev, mapping, map.size, DMA_FROM_DEVICE); kfree(data); printk(KERN_ERR"VFIO_IOMMU_MAP_DMA: io_remap_pfn_range failed\n"); return -EFAULT; } printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: recording the mapping %d\n", netmdev->mappings_count); netmdev->mappings[netmdev->mappings_count].dev = parent_dev; netmdev->mappings[netmdev->mappings_count].vaddr = data; netmdev->mappings[netmdev->mappings_count].iova = mapping; netmdev->mappings[netmdev->mappings_count].size = map.size; netmdev->mappings_count++; printk(KERN_INFO"VFIO_IOMMU_MAP_DMA: preparing response back to user\n"); if (copy_to_user((void __user *)arg, &map, minsz)) return -EFAULT; ret = 0; out: return ret; } #endif return -EOPNOTSUPP; case VFIO_DEVICE_RESET: if (netif_running(adapter->netdev)) e1000e_reinit_locked(adapter); else e1000e_reset(adapter); return 0; default: return -EOPNOTSUPP; } } static int e1000e_vfio_net_mmap(struct mdev_device *mdev, struct vm_area_struct *vma) { struct net_device *netdev = netmdev_get_netdev(mdev); struct e1000_adapter *adapter = netdev_priv(netdev); u64 phys_len, req_len, pgoff, req_start; unsigned long phys_pfn; unsigned int index; if (vma->vm_end < vma->vm_start) return -EINVAL; if ((vma->vm_flags & VM_SHARED) == 0) return -EINVAL; index = vma->vm_pgoff >> (VFIO_NET_OFFSET_SHIFT - PAGE_SHIFT); switch (index) { case VFIO_PCI_BAR0_REGION_INDEX: /* PCI resource 0 */ phys_pfn = pci_resource_start(adapter->pdev, index) >> PAGE_SHIFT; phys_len = pci_resource_len(adapter->pdev, index); break; case VFIO_PCI_NUM_REGIONS + 1: /* RX descriptor rings */ phys_pfn = (u64)virt_to_phys(adapter->rx_ring[0].desc) >> PAGE_SHIFT; phys_len = adapter->rx_ring[0].size; break; case VFIO_PCI_NUM_REGIONS + 2: /* TX descriptor rings */ phys_pfn = (u64)virt_to_phys(adapter->tx_ring[0].desc) >> PAGE_SHIFT; phys_len = adapter->tx_ring[0].size; break; default: return -EINVAL; } req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_NET_OFFSET_SHIFT - PAGE_SHIFT)) - 1); req_start = pgoff << PAGE_SHIFT; if (req_start + req_len > phys_len) return -EINVAL; vma->vm_private_data = NULL; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_pgoff = phys_pfn + pgoff; return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, req_len, vma->vm_page_prot); } /* Common vfio-net code: move to core */ #if 1 /* SYSFS structure for the controlling device */ static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf) { return scnprintf(buf, PAGE_SIZE, "%d\n", 1); } static MDEV_TYPE_ATTR_RO(available_instances); static ssize_t device_api_show(struct kobject *kobj, struct device *dev, char *buf) { return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); } static MDEV_TYPE_ATTR_RO(device_api); static struct attribute *sysfs_vfnetdev_attributes[] = { &mdev_type_attr_available_instances.attr, &mdev_type_attr_device_api.attr, NULL, }; static struct attribute_group sysfs_vfnetdev_type = { .name = "vfnetdev", .attrs = sysfs_vfnetdev_attributes, }; /* Only 1 supported for now */ static struct attribute_group *sysfs_type_list[] = { &sysfs_vfnetdev_type, NULL }; /* * libraries */ static struct net_device *netmdev_get_netdev(struct mdev_device *mdev) { struct netmdev *netmdev; netmdev = mdev_get_drvdata(mdev); if (!netmdev) return NULL; return netmdev->netdev; } /* * SYSFS structure for created mdevices */ static ssize_t netdev_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mdev_device *mdev; struct net_device *netdev; mdev = mdev_from_dev(dev); if (!mdev) return scnprintf(buf, PAGE_SIZE, "mdev not found\n"); netdev = netmdev_get_netdev(mdev); if (!netdev) return scnprintf(buf, PAGE_SIZE, "ndev-mdev not found\n"); return scnprintf(buf, PAGE_SIZE, "%.16s\n", netdev->name); } static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mdev_device *mdev; struct net_device *port; struct netmdev *netmdev; char name[IFNAMSIZ+1]; if (count < 2) return -EINVAL; mdev = mdev_from_dev(dev); if (!mdev) return -ENODEV; netmdev = mdev_get_drvdata(mdev); if (netmdev) return -ENODEV; netmdev = kzalloc(sizeof(*netmdev), GFP_KERNEL); if (!netmdev) return -ENOMEM; mdev_set_drvdata(mdev, netmdev); if (count > IFNAMSIZ) return -ENODEV; memset(name, 0, sizeof(name)); scnprintf(name, IFNAMSIZ + 1, "%.*s", (int)count - 1, buf); port = dev_get_by_name(&init_net, name); if (!port) return -ENODEV; /* FIXME find a way to check if this is the parent device */ //if (&port->dev != mdev_parent_dev(mdev)) return -1; netmdev->netdev = port; return count; } static DEVICE_ATTR_RW(netdev); static struct attribute *sysfs_mdev_vfnetdev_attributes[] = { &dev_attr_netdev.attr, NULL, }; static struct attribute_group sysfs_mdev_vfnetdev_group = { .name = "vfnetdev", .attrs = sysfs_mdev_vfnetdev_attributes, }; static const struct attribute_group *sysfs_mdev_groups[] = { &sysfs_mdev_vfnetdev_group, NULL, }; static int e1000_vfio_net_create(struct kobject *kobj, struct mdev_device *mdev) { return 0; } static int e1000_vfio_net_remove(struct mdev_device *mdev) { struct netmdev *netmdev = mdev_get_drvdata(mdev); struct net_device *port; printk(KERN_INFO "%s %d\n", __func__, __LINE__); port = netmdev_get_netdev(mdev); dev_put(port); kfree(netmdev); mdev_set_drvdata(mdev, NULL); return 0; } #endif const struct mdev_parent_ops e1000_vfio_net_ops = { .supported_type_groups = sysfs_type_list, .mdev_attr_groups = sysfs_mdev_groups, .create = e1000_vfio_net_create, .remove = e1000_vfio_net_remove, .open = e1000e_vfio_net_open, .release = e1000e_vfio_net_release, .read = NULL, .write = NULL, .mmap = e1000e_vfio_net_mmap, .ioctl = e1000e_vfio_net_ioctl, };