// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 *   Jake Oshins <[email protected]>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM. This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications. Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique. Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector. This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities. For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
 * Similarly, device setup and teardown involve messages sent to and from
 * the PCI back-end driver in Hyper-V.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-ecam.h>
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <linux/refcount.h>
#include <linux/irqdomain.h>
#include <linux/acpi.h>
#include <linux/sizes.h>
#include <asm/mshyperv.h>

/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) …
#define PCI_MAJOR_VERSION(version) …
#define PCI_MINOR_VERSION(version) …

enum pci_protocol_version_t { … };

#define CPU_AFFINITY_ALL …

/*
 * Supported protocol versions in the order of probing - highest goes
 * first.
 */
static enum pci_protocol_version_t pci_protocol_versions[] = …;

#define PCI_CONFIG_MMIO_LENGTH …
#define CFG_PAGE_OFFSET …
#define CFG_PAGE_SIZE …

#define MAX_SUPPORTED_MSI_MESSAGES …

#define STATUS_REVISION_MISMATCH …

/* space for 32bit serial number as string */
#define SLOT_NAME_SIZE …

/*
 * Size of requestor for VMbus; the value is based on the observation
 * that having more than one request outstanding is 'rare', and so 64
 * should be generous in ensuring that we don't ever run out.
 */
#define HV_PCI_RQSTOR_SIZE …

/*
 * Message Types
 */

enum pci_message_type { … };

/*
 * Structures defining the virtual PCI Express protocol.
 */

union pci_version { … } __packed;
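/*
 * Illustrative reconstruction: the version macro bodies above are elided
 * ("…") in this listing. Given the documented layout (low word = minor,
 * high word = major), they plausibly reduce to the helpers below; the
 * "example_" names are hypothetical and not part of the driver.
 */
static inline u32 example_pci_make_version(u16 major, u16 minor)
{
	/* High word carries the major version, low word the minor. */
	return ((u32)major << 16) | minor;
}

static inline u16 example_pci_major_version(u32 version)
{
	return version >> 16;
}

static inline u16 example_pci_minor_version(u32 version)
{
	return version & 0xffff;
}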
/*
 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 * which is all this driver does. This representation is the one used in
 * Windows, which is what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding { … } __packed;

/*
 * Pretty much as defined in the PCI Specifications.
 */
struct pci_function_description { … } __packed;

enum pci_device_description_flags { … };

struct pci_function_description2 { … } __packed;

/**
 * struct hv_msi_desc
 * @vector: IDT entry
 * @delivery_mode: As defined in Intel's Programmer's
 *	Reference Manual, Volume 3, Chapter 8.
 * @vector_count: Number of contiguous entries in the
 *	Interrupt Descriptor Table that are
 *	occupied by this Message-Signaled
 *	Interrupt. For "MSI", as first defined
 *	in PCI 2.2, this can be between 1 and
 *	32. For "MSI-X," as first defined in PCI
 *	3.0, this must be 1, as each MSI-X table
 *	entry would have its own descriptor.
 * @reserved: Empty space
 * @cpu_mask: All the target virtual processors.
 */
struct hv_msi_desc { … } __packed;

/**
 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
 * @vector: IDT entry
 * @delivery_mode: As defined in Intel's Programmer's
 *	Reference Manual, Volume 3, Chapter 8.
 * @vector_count: Number of contiguous entries in the
 *	Interrupt Descriptor Table that are
 *	occupied by this Message-Signaled
 *	Interrupt. For "MSI", as first defined
 *	in PCI 2.2, this can be between 1 and
 *	32. For "MSI-X," as first defined in PCI
 *	3.0, this must be 1, as each MSI-X table
 *	entry would have its own descriptor.
 * @processor_count: Number of bits enabled in array.
 * @processor_array: All the target virtual processors.
 */
struct hv_msi_desc2 { … } __packed;

/*
 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
 * Everything is the same as in 'hv_msi_desc2' except that the size of the
 * 'vector' field is larger to support bigger vector values, e.g. LPI
 * vectors on ARM.
 */
struct hv_msi_desc3 { … } __packed;

/**
 * struct tran_int_desc
 * @reserved: unused, padding
 * @vector_count: same as in hv_msi_desc
 * @data: This is the "data payload" value that is
 *	written by the device when it generates
 *	a message-signaled interrupt, either MSI
 *	or MSI-X.
 * @address: This is the address to which the data
 *	payload is written on interrupt
 *	generation.
 */
struct tran_int_desc { … } __packed;

/*
 * A generic message format for virtual PCI.
 * Specific message formats are defined later in the file.
 */
struct pci_message { … } __packed;

struct pci_child_message { … } __packed;

struct pci_incoming_message { … } __packed;

struct pci_response { … } __packed;

struct pci_packet { … };

/*
 * Specific message types supporting the PCI protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * pci_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */
struct pci_version_request { … } __packed;
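/*
 * Hedged sketch of the "try versions until one sticks" flow described
 * above. The send-and-wait step is abstracted as a callback because the
 * packet bookkeeping (completion, VMbus requestor) is elided here; the
 * real logic lives in hv_pci_protocol_negotiation() later in this file.
 */
static int __maybe_unused
example_negotiate_version(struct hv_device *hdev,
			  int (*try_version)(struct hv_device *,
					     enum pci_protocol_version_t))
{
	int i, ret = -EPROTO;

	for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
		ret = try_version(hdev, pci_protocol_versions[i]);
		if (!ret)
			break;	/* The host accepted this version. */
		/*
		 * Otherwise (e.g. on STATUS_REVISION_MISMATCH) fall
		 * through and probe the next, lower, version.
		 */
	}

	return ret;
}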
/*
 * Bus D0 Entry. This is sent from the guest to the host when the virtual
 * bus (PCI Express port) is ready for action.
 */
struct pci_bus_d0_entry { … } __packed;

struct pci_bus_relations { … } __packed;

struct pci_bus_relations2 { … } __packed;

struct pci_q_res_req_response { … } __packed;

struct pci_set_power { … } __packed;

struct pci_set_power_response { … } __packed;

struct pci_resources_assigned { … } __packed;

struct pci_resources_assigned2 { … } __packed;

struct pci_create_interrupt { … } __packed;

struct pci_create_int_response { … } __packed;

struct pci_create_interrupt2 { … } __packed;

struct pci_create_interrupt3 { … } __packed;

struct pci_delete_interrupt { … } __packed;

/*
 * Note: the VM must pass a valid block id, wslot and bytes_requested.
 */
struct pci_read_block { … } __packed;

struct pci_read_block_response { … } __packed;

/*
 * Note: the VM must pass a valid block id, wslot and byte_count.
 */
struct pci_write_block { … } __packed;

struct pci_dev_inval_block { … } __packed;

struct pci_dev_incoming { … } __packed;

struct pci_eject_response { … } __packed;

static int pci_ring_size = …;

/*
 * Driver specific state.
 */

enum hv_pcibus_state { … };

struct hv_pcibus_device { … };

/*
 * Tracks "Device Relations" messages from the host, which must be both
 * processed in order and deferred so that they don't run in the context
 * of the incoming packet callback.
 */
struct hv_dr_work { … };

struct hv_pcidev_description { … };

struct hv_dr_state { … };

struct hv_pci_dev { … };

struct hv_pci_compl { … };

static void hv_pci_onchannelcallback(void *context);

#ifdef CONFIG_X86
#define DELIVERY_MODE …
#define FLOW_HANDLER …
#define FLOW_NAME …

static int hv_pci_irqchip_init(void) { … }

static struct irq_domain *hv_pci_get_root_domain(void) { … }

static unsigned int hv_msi_get_int_vector(struct irq_data *data) { … }

#define hv_msi_prepare …

/**
 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data: Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_arch_irq_unmask(struct irq_data *data) { … }

#elif defined(CONFIG_ARM64)
/*
 * SPI vectors to use for vPCI; the arch SPI range is [32, 1019], but leave
 * a bit of room at the start to allow for SPIs to be specified through ACPI,
 * and start at a power of two to satisfy the power-of-2 multi-MSI
 * requirement.
 */
#define HV_PCI_MSI_SPI_START …
#define HV_PCI_MSI_SPI_NR …
#define DELIVERY_MODE …
#define FLOW_HANDLER …
#define FLOW_NAME …
#define hv_msi_prepare …

struct hv_pci_chip_data {
	DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
	struct mutex map_lock;
};

/* Hyper-V vPCI MSI GIC IRQ domain */
static struct irq_domain *hv_msi_gic_irq_domain;

/* Hyper-V PCI MSI IRQ chip */
static struct irq_chip hv_arm64_msi_irq_chip = {
	.name = "MSI",
	.irq_set_affinity = irq_chip_set_affinity_parent,
	.irq_eoi = irq_chip_eoi_parent,
	.irq_mask = irq_chip_mask_parent,
	.irq_unmask = irq_chip_unmask_parent
};

static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
{
	return irqd->parent_data->hwirq;
}
/*
 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from
 *	the bitmap.
 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from
 *	the parent domain.
 */
static void hv_pci_vec_irq_free(struct irq_domain *domain,
				unsigned int virq,
				unsigned int nr_bm_irqs,
				unsigned int nr_dom_irqs)
{
	struct hv_pci_chip_data *chip_data = domain->host_data;
	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
	int first = d->hwirq - HV_PCI_MSI_SPI_START;
	int i;

	mutex_lock(&chip_data->map_lock);
	bitmap_release_region(chip_data->spi_map,
			      first,
			      get_count_order(nr_bm_irqs));
	mutex_unlock(&chip_data->map_lock);
	for (i = 0; i < nr_dom_irqs; i++) {
		if (i)
			d = irq_domain_get_irq_data(domain, virq + i);
		irq_domain_reset_irq_data(d);
	}

	irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
}

static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
				       unsigned int virq,
				       unsigned int nr_irqs)
{
	hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
}

static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
				       unsigned int nr_irqs,
				       irq_hw_number_t *hwirq)
{
	struct hv_pci_chip_data *chip_data = domain->host_data;
	int index;

	/* Find and allocate region from the SPI bitmap */
	mutex_lock(&chip_data->map_lock);
	index = bitmap_find_free_region(chip_data->spi_map,
					HV_PCI_MSI_SPI_NR,
					get_count_order(nr_irqs));
	mutex_unlock(&chip_data->map_lock);
	if (index < 0)
		return -ENOSPC;

	*hwirq = index + HV_PCI_MSI_SPI_START;

	return 0;
}

static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
					   unsigned int virq,
					   irq_hw_number_t hwirq)
{
	struct irq_fwspec fwspec;
	struct irq_data *d;
	int ret;

	fwspec.fwnode = domain->parent->fwnode;
	fwspec.param_count = 2;
	fwspec.param[0] = hwirq;
	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;

	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
	if (ret)
		return ret;

	/*
	 * Since the interrupt specifier is not coming from ACPI or DT, the
	 * trigger type will need to be set explicitly. Otherwise, it will be
	 * set to whatever is in the GIC configuration.
	 */
	d = irq_domain_get_irq_data(domain->parent, virq);

	return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
}

static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
				       unsigned int virq, unsigned int nr_irqs,
				       void *args)
{
	irq_hw_number_t hwirq;
	unsigned int i;
	int ret;

	ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
	if (ret)
		return ret;

	for (i = 0; i < nr_irqs; i++) {
		ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
						      hwirq + i);
		if (ret) {
			hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
			return ret;
		}

		irq_domain_set_hwirq_and_chip(domain, virq + i,
					      hwirq + i,
					      &hv_arm64_msi_irq_chip,
					      domain->host_data);
		pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
	}

	return 0;
}

/*
 * Pick the first cpu as the irq affinity that can be temporarily used for
 * composing MSI from the hypervisor. GIC will eventually set the right
 * affinity for the irq and the 'unmask' will retarget the interrupt to that
 * cpu.
 */
static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
					  struct irq_data *irqd, bool reserve)
{
	int cpu = cpumask_first(cpu_present_mask);

	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));

	return 0;
}

static const struct irq_domain_ops hv_pci_domain_ops = {
	.alloc = hv_pci_vec_irq_domain_alloc,
	.free = hv_pci_vec_irq_domain_free,
	.activate = hv_pci_vec_irq_domain_activate,
};

static int hv_pci_irqchip_init(void)
{
	static struct hv_pci_chip_data *chip_data;
	struct fwnode_handle *fn = NULL;
	int ret = -ENOMEM;

	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
	if (!chip_data)
		return ret;

	mutex_init(&chip_data->map_lock);
	fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
	if (!fn)
		goto free_chip;

	/*
	 * Once enabled, the IRQ domain should not be removed, since there is
	 * no way to ensure that all the corresponding devices are also gone
	 * and no interrupts will be generated.
	 */
	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
							  fn, &hv_pci_domain_ops,
							  chip_data);

	if (!hv_msi_gic_irq_domain) {
		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
		goto free_chip;
	}

	return 0;

free_chip:
	kfree(chip_data);
	if (fn)
		irq_domain_free_fwnode(fn);

	return ret;
}

static struct irq_domain *hv_pci_get_root_domain(void)
{
	return hv_msi_gic_irq_domain;
}

/*
 * SPIs are used for the interrupts of PCI devices, and SPIs are managed via
 * GICD registers, which Hyper-V already supports, so no hypercall is needed.
 */
static void hv_arch_irq_unmask(struct irq_data *data) { }

#endif /* CONFIG_ARM64 */

/**
 * hv_pci_generic_compl() - Invoked for a completion packet
 * @context: Set up by the sender of the packet.
 * @resp: The response packet
 * @resp_packet_size: Size in bytes of the packet
 *
 * This function is used to trigger an event and report status
 * for any message for which the completion packet contains a
 * status and nothing else.
 */
static void hv_pci_generic_compl(void *context, struct pci_response *resp,
				 int resp_packet_size) { … }

static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot);

static void get_pcichild(struct hv_pci_dev *hpdev) { … }

static void put_pcichild(struct hv_pci_dev *hpdev) { … }

/*
 * There is no good way to get notified from vmbus_onoffer_rescind(),
 * so let's use polling here, since this is not a hot path.
 */
static int wait_for_response(struct hv_device *hdev,
			     struct completion *comp) { … }

/**
 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 * @devfn: The Linux representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Windows representation
 */
static u32 devfn_to_wslot(int devfn) { … }

/**
 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 * @wslot: The Windows representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Linux representation
 */
static int wslot_to_devfn(u32 wslot) { … }

static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size,
			     u32 *val) { … }

static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size,
			      u32 val) { … }

/*
 * PCI Configuration Space for these root PCI buses is implemented as a pair
 * of pages in memory-mapped I/O space. Writing to the first page chooses
 * the PCI function being written or read. Once the first page has been
 * written to, the following page maps in the entire configuration space of
 * the function.
 */
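/*
 * Hedged illustration of the select-then-access pattern described above.
 * The real accessors (_hv_pcifront_read_config() below) also serialize the
 * sequence with a lock and use CFG_PAGE_OFFSET/CFG_PAGE_SIZE, whose values
 * are elided in this listing; the 0x1000 offset and the bare cfg_base
 * parameter here are assumptions for illustration only.
 */
static u32 __maybe_unused example_read_cfg_dword(void __iomem *cfg_base,
						 u32 wslot, int where)
{
	/* Page 0: select which function's config space is mapped in. */
	writel(wslot, cfg_base);
	/* Make sure the function select lands before the data access. */
	mb();
	/* Page 1: read from the selected function's config space. */
	return readl(cfg_base + 0x1000 + where);
}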
/**
 * _hv_pcifront_read_config() - Internal PCI config read
 * @hpdev: The PCI driver's representation of the device
 * @where: Offset within config space
 * @size: Size of the transfer
 * @val: Pointer to the buffer receiving the data
 */
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val) { … }

static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) { … }

/**
 * _hv_pcifront_write_config() - Internal PCI config write
 * @hpdev: The PCI driver's representation of the device
 * @where: Offset within config space
 * @size: Size of the transfer
 * @val: The data being transferred
 */
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val) { … }

/**
 * hv_pcifront_read_config() - Read configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be read
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
				   int where, int size, u32 *val) { … }

/**
 * hv_pcifront_write_config() - Write configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be written to device
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
				    int where, int size, u32 val) { … }

/* PCIe operations */
static struct pci_ops hv_pcifront_ops = …;

/*
 * Paravirtual backchannel
 *
 * Hyper-V SR-IOV provides a backchannel mechanism in software for
 * communication between a VF driver and a PF driver. These
 * "configuration blocks" are similar in concept to PCI configuration space,
 * but instead of doing reads and writes in 32-bit chunks through a very slow
 * path, packets of up to 128 bytes can be sent or received asynchronously.
 *
 * Nearly every SR-IOV device contains just such a communications channel in
 * hardware, so using this one in software is usually optional. Using the
 * software channel, however, allows driver implementers to leverage software
 * tools that fuzz the communications channel looking for vulnerabilities.
 *
 * The usage model for these packets puts the responsibility for reading or
 * writing on the VF driver. The VF driver sends a read or a write packet,
 * indicating which "block" is being referred to by number.
 *
 * If the PF driver wishes to initiate communication, it can "invalidate" one
 * or more of the first 64 blocks. This invalidation is delivered via a
 * callback which the VF driver supplies to this driver.
 *
 * No protocol is implied, except that supplied by the PF and VF drivers.
 */

struct hv_read_config_compl { … };

/**
 * hv_pci_read_config_compl() - Invoked when a response packet
 * for a read config block operation arrives.
 * @context: Identifies the read config operation
 * @resp: The response packet itself
 * @resp_packet_size: Size in bytes of the response packet
 */
static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
				     int resp_packet_size) { … }
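/*
 * Hedged usage sketch: how a VF driver might pull config block 0 through
 * the software backchannel described above (the 128-byte cap comes from
 * the protocol description). example_vf_read_block0() is hypothetical;
 * real consumers reach these helpers through an ops table rather than by
 * calling the static function directly. The forward declaration matches
 * the definition just below.
 */
static int hv_read_config_block(struct pci_dev *pdev, void *buf,
				unsigned int len, unsigned int block_id,
				unsigned int *bytes_returned);

static int __maybe_unused example_vf_read_block0(struct pci_dev *pdev)
{
	u8 buf[128];	/* Blocks are at most 128 bytes. */
	unsigned int bytes_returned = 0;
	int ret;

	ret = hv_read_config_block(pdev, buf, sizeof(buf), 0,
				   &bytes_returned);
	if (ret)
		return ret;

	dev_info(&pdev->dev, "config block 0: %u bytes\n", bytes_returned);
	return 0;
}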
/**
 * hv_read_config_block() - Sends a read config block request to
 * the back-end driver running in the Hyper-V parent partition.
 * @pdev: The PCI driver's representation for this device.
 * @buf: Buffer into which the config block will be copied.
 * @len: Size in bytes of buf.
 * @block_id: Identifies the config block which has been requested.
 * @bytes_returned: Size which came back from the back-end driver.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_read_config_block(struct pci_dev *pdev, void *buf,
				unsigned int len, unsigned int block_id,
				unsigned int *bytes_returned) { … }

/**
 * hv_pci_write_config_compl() - Invoked when a response packet for a write
 * config block operation arrives.
 * @context: Identifies the write config operation
 * @resp: The response packet itself
 * @resp_packet_size: Size in bytes of the response packet
 */
static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
				      int resp_packet_size) { … }

/**
 * hv_write_config_block() - Sends a write config block request to the
 * back-end driver running in the Hyper-V parent partition.
 * @pdev: The PCI driver's representation for this device.
 * @buf: Buffer from which the config block will be copied.
 * @len: Size in bytes of buf.
 * @block_id: Identifies the config block which is being written.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_write_config_block(struct pci_dev *pdev, void *buf,
				 unsigned int len, unsigned int block_id) { … }

/**
 * hv_register_block_invalidate() - Invoked when a config block invalidation
 * arrives from the back-end driver.
 * @pdev: The PCI driver's representation for this device.
 * @context: Identifies the device.
 * @block_invalidate: Identifies all of the blocks being invalidated.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
					void (*block_invalidate)(void *context,
								 u64 block_mask)) { … }

/* Interrupt management hooks */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
			     struct tran_int_desc *int_desc) { … }

/**
 * hv_msi_free() - Free the MSI.
 * @domain: The interrupt domain pointer
 * @info: Extra MSI-related context
 * @irq: Identifies the IRQ.
 *
 * The Hyper-V parent partition and hypervisor are tracking the
 * messages that are in use, keeping the interrupt redirection
 * table up to date. This callback sends a message that frees
 * the IRT entry and related tracking nonsense.
 */
static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int irq) { … }

static void hv_irq_mask(struct irq_data *data) { … }

static void hv_irq_unmask(struct irq_data *data) { … }

struct compose_comp_ctxt { … };

static void hv_pci_compose_compl(void *context, struct pci_response *resp,
				 int resp_packet_size) { … }

static u32 hv_compose_msi_req_v1(struct pci_create_interrupt *int_pkt,
				 u32 slot, u8 vector, u16 vector_count) { … }

/*
 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
 * interrupted is specified later in hv_irq_unmask() and communicated to
 * Hyper-V via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of
 * dummy vCPU is not irrelevant, because Hyper-V chooses the physical CPU to
 * handle the interrupts based on the vCPU specified in the message sent to
 * the vPCI VSP in hv_compose_msi_msg(). Hyper-V's choice of pCPU is not
 * visible to the guest, but assigning too many vPCI device interrupts to the
 * same pCPU can cause a performance bottleneck. So we spread out the dummy
 * vCPUs to influence Hyper-V to spread out the pCPUs that it selects.
 *
 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
 * to always return the same dummy vCPU, because a second call to
 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
 * new pCPU for the interrupt. But for the multi-MSI case, the second call to
 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
 * the pCPUs are spread out. All interrupts for a multi-MSI device end up
 * using the same pCPU, even though the vCPUs will be spread out by later
 * calls to hv_irq_unmask(); that is the best we can do now.
 *
 * As of Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does
 * *not* cause Hyper-V to reselect the pCPU based on the specified vCPU. Such
 * an enhancement is planned for a future version. With that enhancement, the
 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
 * device will be spread across multiple pCPUs.
 */
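/*
 * Hedged sketch of the round-robin selection described above. The real
 * hv_compose_multi_msi_req_get_cpu() below is elided in this listing and
 * may differ; this shows one straightforward way to rotate the dummy vCPU
 * across online CPUs under a lock.
 */
static int __maybe_unused example_multi_msi_get_cpu(void)
{
	static DEFINE_SPINLOCK(example_cpu_lock);
	static int cpu_next = -1;	/* -1: start from the first CPU */
	unsigned long flags;
	int cpu;

	spin_lock_irqsave(&example_cpu_lock, flags);
	cpu = cpumask_next(cpu_next, cpu_online_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(cpu_online_mask);
	cpu_next = cpu;
	spin_unlock_irqrestore(&example_cpu_lock, flags);

	return cpu;
}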
/*
 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
 * by the subsequent retarget in hv_irq_unmask().
 */
static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity) { … }

/*
 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
 */
static int hv_compose_multi_msi_req_get_cpu(void) { … }

static u32 hv_compose_msi_req_v2(struct pci_create_interrupt2 *int_pkt,
				 int cpu, u32 slot, u8 vector,
				 u16 vector_count) { … }

static u32 hv_compose_msi_req_v3(struct pci_create_interrupt3 *int_pkt,
				 int cpu, u32 slot, u32 vector,
				 u16 vector_count) { … }

/**
 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 * @data: Everything about this MSI
 * @msg: Buffer that is filled in by this function
 *
 * This function unpacks the IRQ looking for target CPU set, IDT
 * vector and mode and sends a message to the parent partition
 * asking for a mapping for that tuple in this partition. The
 * response supplies a data value and address to which that data
 * should be written to trigger that interrupt.
 */
static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { … }

/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = …;

static struct msi_domain_ops hv_msi_ops = …;

/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus: The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through. These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages. Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) { … }

/**
 * get_bar_size() - Get the address space consumed by a BAR
 * @bar_val: Value that a BAR returned after -1 was written
 *	to it.
 *
 * This function returns the size of the BAR, rounded up to 1
 * page. It has to be rounded up because the hypervisor's page
 * table entry that maps the BAR into the VM can't specify an
 * offset within a page. The invariant is that the hypervisor
 * must place any BARs smaller than a page at the beginning of
 * a page.
 *
 * Return: Size in bytes of the consumed MMIO space.
 */
static u64 get_bar_size(u64 bar_val) { … }

/**
 * survey_child_resources() - Total all MMIO requirements
 * @hbus: Root PCI bus, as understood by this driver
 */
static void survey_child_resources(struct hv_pcibus_device *hbus) { … }
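/*
 * Hedged sketch of the BAR sizing arithmetic described at get_bar_size()
 * above: mask off the BAR's flag bits, take the two's complement to
 * recover the size, and round up to a whole page. The elided body may
 * differ in detail.
 */
static u64 __maybe_unused example_get_bar_size(u64 bar_val)
{
	/* (1 + ~x) == -x: turns the size-aligned mask back into a size. */
	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
			PAGE_SIZE);
}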
/**
 * prepopulate_bars() - Fill in BARs with defaults
 * @hbus: Root PCI bus, as understood by this driver
 *
 * The core PCI driver code seems much, much happier if the BARs
 * for a device have values upon first scan. So fill them in.
 * The algorithm below works down from large sizes to small,
 * attempting to pack the assignments optimally. The assumption,
 * enforced in other parts of the code, is that the beginning of
 * the memory-mapped I/O space will be aligned on the largest
 * BAR size.
 */
static void prepopulate_bars(struct hv_pcibus_device *hbus) { … }

/*
 * Assign entries in sysfs pci slot directory.
 *
 * Note that this function does not need to lock the children list
 * because it is called from pci_devices_present_work which
 * is serialized with hv_eject_device_work because they are on the
 * same ordered workqueue. Therefore hbus->children list will not change
 * even when pci_create_slot sleeps.
 */
static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) { … }

/*
 * Remove entries in sysfs pci slot directory.
 */
static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) { … }

/*
 * Set NUMA node for the devices on the bus
 */
static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) { … }

/**
 * create_root_hv_pci_bus() - Expose a new root PCI bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * Return: 0 on success, -errno on failure
 */
static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) { … }

struct q_res_req_compl { … };

/**
 * q_resource_requirements() - Query Resource Requirements
 * @context: The completion context.
 * @resp: The response that came from the host.
 * @resp_packet_size: The size in bytes of resp.
 *
 * This function is invoked on completion of a Query Resource
 * Requirements packet.
 */
static void q_resource_requirements(void *context, struct pci_response *resp,
				    int resp_packet_size) { … }

/**
 * new_pcichild_device() - Create a new child device
 * @hbus: The internal struct tracking this root PCI bus.
 * @desc: The information supplied so far from the host
 *	about the device.
 *
 * This function creates the tracking structure for a new child
 * device and kicks off the process of figuring out what it is.
 *
 * Return: Pointer to the new tracking struct
 */
static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
					      struct hv_pcidev_description *desc) { … }

/**
 * get_pcichild_wslot() - Find device from slot
 * @hbus: Root PCI bus, as understood by this driver
 * @wslot: Location on the bus
 *
 * This function looks up a PCI device and returns the internal
 * representation of it. It acquires a reference on it, so that
 * the device won't be deleted while somebody is using it. The
 * caller is responsible for calling put_pcichild() to release
 * this reference.
 *
 * Return: Internal representation of a PCI device
 */
static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot) { … }
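/*
 * Hedged sketch of the lookup-with-reference pattern that
 * get_pcichild_wslot() implements. The hv_pcibus_device field names used
 * here (device_list_lock, children, list_entry) and desc.win_slot are
 * assumptions, since the struct bodies are elided above.
 */
static __maybe_unused struct hv_pci_dev *
example_find_child(struct hv_pcibus_device *hbus, u32 wslot)
{
	struct hv_pci_dev *iter, *hpdev = NULL;
	unsigned long flags;

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(iter, &hbus->children, list_entry) {
		if (iter->desc.win_slot.slot == wslot) {
			/* Take the reference before dropping the lock. */
			hpdev = iter;
			get_pcichild(hpdev);
			break;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	return hpdev;	/* Caller must put_pcichild() when done. */
}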
/**
 * pci_devices_present_work() - Handle new list of child devices
 * @work: Work struct embedded in struct hv_dr_work
 *
 * "Bus Relations" is the Windows term for "children of this
 * bus." The terminology is preserved here for people trying to
 * debug the interaction between Hyper-V and Linux. This
 * function is called when the parent partition reports a list
 * of functions that should be observed under this PCI Express
 * port (bus).
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information. The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is serialized with hv_eject_device_work(),
 * because both are pushed to the ordered workqueue hbus->wq.
 */
static void pci_devices_present_work(struct work_struct *work) { … }

/**
 * hv_pci_start_relations_work() - Queue work to start device discovery
 * @hbus: Root PCI bus, as understood by this driver
 * @dr: The list of children returned from host
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
				       struct hv_dr_state *dr) { … }

/**
 * hv_pci_devices_present() - Handle list of new children
 * @hbus: Root PCI bus, as understood by this driver
 * @relations: Packet from host listing children
 *
 * Process a new list of devices on the bus. The list of devices is
 * discovered by the VSP and sent to us via the VSP message
 * PCI_BUS_RELATIONS, whenever a new list of devices for this bus
 * appears.
 */
static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
				   struct pci_bus_relations *relations) { … }

/**
 * hv_pci_devices_present2() - Handle list of new children
 * @hbus: Root PCI bus, as understood by this driver
 * @relations: Packet from host listing children
 *
 * This function is the v2 version of hv_pci_devices_present().
 */
static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
				    struct pci_bus_relations2 *relations) { … }

/**
 * hv_eject_device_work() - Asynchronously handles ejection
 * @work: Work struct embedded in internal device struct
 *
 * This function handles ejecting a device. Windows will
 * attempt to gracefully eject a device, waiting 60 seconds to
 * hear back from the guest OS that this completed successfully.
 * If this timer expires, the device will be forcibly removed.
 */
static void hv_eject_device_work(struct work_struct *work) { … }

/**
 * hv_pci_eject_device() - Handles device ejection
 * @hpdev: Internal device tracking struct
 *
 * This function is invoked when an ejection packet arrives. It
 * just schedules work so that we don't re-enter the packet
 * delivery code handling the ejection.
 */
static void hv_pci_eject_device(struct hv_pci_dev *hpdev) { … }

/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context: Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context) { … }
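/*
 * Hedged sketch of the dispatch pattern hv_pci_onchannelcallback()
 * implements: drain the VMbus ring, complete guest-initiated requests,
 * and defer host-initiated messages (bus relations, eject, block
 * invalidate) to workers. The packet-type handling mirrors the
 * description above; buffer sizing and the bookkeeping details are
 * assumptions, since the real body is elided.
 */
static void __maybe_unused example_channel_callback(void *context)
{
	struct vmbus_channel *chan = context;
	u8 buf[0x100] __aligned(8);
	struct vmpacket_descriptor *desc = (struct vmpacket_descriptor *)buf;
	u32 bytes_recvd;
	u64 req_id;

	while (!vmbus_recvpacket_raw(chan, buf, sizeof(buf),
				     &bytes_recvd, &req_id) &&
	       bytes_recvd > 0) {
		switch (desc->type) {
		case VM_PKT_COMP:
			/*
			 * Completion of a request this driver sent; req_id
			 * identifies the waiter to wake (via the VMbus
			 * requestor in the real driver).
			 */
			break;
		case VM_PKT_DATA_INBAND:
			/*
			 * Host-initiated message: queue work rather than
			 * processing it in the channel callback itself.
			 */
			break;
		default:
			break;
		}
	}
}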
/**
 * hv_pci_protocol_negotiation() - Set up protocol
 * @hdev: VMBus's tracking struct for this root PCI bus.
 * @version: Array of supported channel protocol versions in
 *	the order of probing - highest goes first.
 * @num_version: Number of elements in the version array.
 *
 * This driver is intended to support running on Windows 10
 * (server) and later versions. It will not run on earlier
 * versions, as they assume that many of the operations which
 * Linux needs accomplished with a spinlock held were done
 * through asynchronous messaging over VMBus. Windows 10
 * increases the surface area of PCI emulation so that these
 * actions can take place by suspending a virtual processor for
 * their duration.
 *
 * This function negotiates the channel protocol version,
 * failing if the host doesn't support the necessary protocol
 * level.
 */
static int hv_pci_protocol_negotiation(struct hv_device *hdev,
				       enum pci_protocol_version_t version[],
				       int num_version) { … }

/**
 * hv_pci_free_bridge_windows() - Release memory regions for the
 * bus
 * @hbus: Root PCI bus, as understood by this driver
 */
static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) { … }

/**
 * hv_pci_allocate_bridge_windows() - Allocate memory regions
 * for the bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * This function calls vmbus_allocate_mmio(), which is itself a
 * bit of a compromise. Ideally, we might change the pnp layer
 * in the kernel such that it comprehends either PCI devices
 * which are "grandchildren of ACPI," with some intermediate bus
 * node (in this case, VMBus) or change it such that it
 * understands VMBus. The pnp layer, however, has been declared
 * deprecated, and not subject to change.
 *
 * The workaround, implemented here, is to ask VMBus to allocate
 * MMIO space for this bus. VMBus itself knows which ranges are
 * appropriate by looking at its own ACPI objects. Then, after
 * these ranges are claimed, they're modified to look like they
 * would have looked if the ACPI and pnp code had allocated
 * bridge windows. These descriptors have to exist in this form
 * in order to satisfy the code which will get invoked when the
 * endpoint PCI function driver calls request_mem_region() or
 * request_mem_region_exclusive().
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) { … }

/**
 * hv_allocate_config_window() - Find MMIO space for PCI Config
 * @hbus: Root PCI bus, as understood by this driver
 *
 * This function claims memory-mapped I/O space for accessing
 * configuration space for the functions on this bus.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_allocate_config_window(struct hv_pcibus_device *hbus) { … }

static void hv_free_config_window(struct hv_pcibus_device *hbus) { … }

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);

/**
 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_enter_d0(struct hv_device *hdev) { … }

/**
 * hv_pci_query_relations() - Ask host to send list of child
 * devices
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_query_relations(struct hv_device *hdev) { … }
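/*
 * Hedged sketch of the bridge-window workaround described at
 * hv_pci_allocate_bridge_windows() above: claim MMIO from VMbus, then
 * mark the resource so it looks like a host-bridge window to endpoint
 * drivers calling request_mem_region(). The SZ_1M alignment and the flag
 * twiddling are illustrative assumptions; the real function also handles
 * a separate high-MMIO (above 4 GiB) window.
 */
static int __maybe_unused example_alloc_low_mmio(struct hv_device *hdev,
						 resource_size_t size,
						 struct resource **low_mmio)
{
	int ret;

	/* Ask VMbus for a range below 4 GiB from its ACPI-derived pool. */
	ret = vmbus_allocate_mmio(low_mmio, hdev, 0, 0xFFFFFFFF, size,
				  SZ_1M, false);
	if (ret)
		return ret;

	/* Present it as a (non-busy) bridge window. */
	(*low_mmio)->flags |= IORESOURCE_WINDOW;
	(*low_mmio)->flags &= ~IORESOURCE_BUSY;
	return 0;
}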
/**
 * hv_send_resources_allocated() - Report local resource choices
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * The host OS is expecting to be sent a request as a message
 * which contains all the resources that the device will use.
 * The response contains those same resources, "translated",
 * which is to say, the values which should be used by the
 * hardware, when it delivers an interrupt. (MMIO resources are
 * used in local terms.) This is nice for Windows, and lines up
 * with the FDO/PDO split, which doesn't exist in Linux. Linux
 * is deeply expecting to scan an emulated PCI configuration
 * space. So this message is sent here only to drive the state
 * machine on the host forward.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_allocated(struct hv_device *hdev) { … }

/**
 * hv_send_resources_released() - Report local resources
 * released
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_released(struct hv_device *hdev) { … }

#define HVPCI_DOM_MAP_SIZE …
static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);

/*
 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
 * as invalid for passthrough PCI devices of this driver.
 */
#define HVPCI_DOM_INVALID …

/**
 * hv_get_dom_num() - Get a valid PCI domain number
 * @dom: Requested domain number
 *
 * Check if the PCI domain number is in use, and return another number if
 * it is in use.
 *
 * Return: domain number on success, HVPCI_DOM_INVALID on failure
 */
static u16 hv_get_dom_num(u16 dom) { … }

/**
 * hv_put_dom_num() - Mark the PCI domain number as free
 * @dom: Domain number to be freed
 */
static void hv_put_dom_num(u16 dom) { … }

/**
 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
 * @hdev: VMBus's tracking struct for this root PCI bus
 * @dev_id: Identifies the device itself
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_probe(struct hv_device *hdev,
			const struct hv_vmbus_device_id *dev_id) { … }

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { … }

/**
 * hv_pci_remove() - Remove routine for this VMBus channel
 * @hdev: VMBus's tracking struct for this root PCI bus
 */
static void hv_pci_remove(struct hv_device *hdev) { … }

static int hv_pci_suspend(struct hv_device *hdev) { … }

static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) { … }

/*
 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
 * doesn't trap and emulate the MMIO accesses, hv_compose_msi_msg() must
 * be used here to ask Hyper-V to re-create the IOMMU Interrupt Remapping
 * Table entries.
 */
static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) { … }

static int hv_pci_resume(struct hv_device *hdev) { … }

static const struct hv_vmbus_device_id hv_pci_id_table[] = …;
MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

static struct hv_driver hv_pci_drv = …;

static void __exit exit_hv_pci_drv(void) { … }

static int __init init_hv_pci_drv(void) { … }

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION(…);
MODULE_LICENSE(…);