// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 *   Jake Oshins <[email protected]>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM. This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications. Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique. Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector. This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities. For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
 * Similarly, device setup and teardown involve messages sent to and from
 * the PCI back-end driver in Hyper-V.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pci-ecam.h>
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/hyperv.h>
#include <linux/refcount.h>
#include <linux/irqdomain.h>
#include <linux/acpi.h>
#include <linux/sizes.h>
#include <asm/mshyperv.h>

/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 */

#define PCI_MAKE_VERSION(major, minor) …
#define PCI_MAJOR_VERSION(version) …
#define PCI_MINOR_VERSION(version) …

enum pci_protocol_version_t { … };

#define CPU_AFFINITY_ALL …

/*
 * Supported protocol versions in the order of probing - highest goes
 * first.
 */
static enum pci_protocol_version_t pci_protocol_versions[] = …;

#define PCI_CONFIG_MMIO_LENGTH …
#define CFG_PAGE_OFFSET …
#define CFG_PAGE_SIZE …

#define MAX_SUPPORTED_MSI_MESSAGES …

#define STATUS_REVISION_MISMATCH …

/* space for 32bit serial number as string */
#define SLOT_NAME_SIZE …

/*
 * Size of requestor for VMbus; the value is based on the observation
 * that having more than one request outstanding is 'rare', and so 64
 * should be generous in ensuring that we don't ever run out.
 */
#define HV_PCI_RQSTOR_SIZE …

/*
 * Message Types
 */

enum pci_message_type { … };

/*
 * Structures defining the virtual PCI Express protocol.
 */

union pci_version { … } __packed;
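/*
 * Illustrative reconstruction: the version macro bodies above are elided
 * ("…") in this listing. Given the documented layout (low word = minor,
 * high word = major), they plausibly reduce to the helpers below; the
 * "example_" names are hypothetical and not part of the driver.
 */
static inline u32 example_pci_make_version(u16 major, u16 minor)
{
	/* High word carries the major version, low word the minor. */
	return ((u32)major << 16) | minor;
}

static inline u16 example_pci_major_version(u32 version)
{
	return version >> 16;
}

static inline u16 example_pci_minor_version(u32 version)
{
	return version & 0xffff;
}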
/*
 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 * which is all this driver does. This representation is the one used in
 * Windows, which is what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 */
union win_slot_encoding { … } __packed;

/*
 * Pretty much as defined in the PCI Specifications.
 */
struct pci_function_description { … } __packed;

enum pci_device_description_flags { … };

struct pci_function_description2 { … } __packed;

/**
 * struct hv_msi_desc
 * @vector: IDT entry
 * @delivery_mode: As defined in Intel's Programmer's
 *	Reference Manual, Volume 3, Chapter 8.
 * @vector_count: Number of contiguous entries in the
 *	Interrupt Descriptor Table that are
 *	occupied by this Message-Signaled
 *	Interrupt. For "MSI", as first defined
 *	in PCI 2.2, this can be between 1 and
 *	32. For "MSI-X," as first defined in PCI
 *	3.0, this must be 1, as each MSI-X table
 *	entry would have its own descriptor.
 * @reserved: Empty space
 * @cpu_mask: All the target virtual processors.
 */
struct hv_msi_desc { … } __packed;

/**
 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
 * @vector: IDT entry
 * @delivery_mode: As defined in Intel's Programmer's
 *	Reference Manual, Volume 3, Chapter 8.
 * @vector_count: Number of contiguous entries in the
 *	Interrupt Descriptor Table that are
 *	occupied by this Message-Signaled
 *	Interrupt. For "MSI", as first defined
 *	in PCI 2.2, this can be between 1 and
 *	32. For "MSI-X," as first defined in PCI
 *	3.0, this must be 1, as each MSI-X table
 *	entry would have its own descriptor.
 * @processor_count: Number of bits enabled in array.
 * @processor_array: All the target virtual processors.
 */
struct hv_msi_desc2 { … } __packed;

/*
 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
 * Everything is the same as in 'hv_msi_desc2' except that the size of the
 * 'vector' field is larger to support bigger vector values, e.g. LPI
 * vectors on ARM.
 */
struct hv_msi_desc3 { … } __packed;

/**
 * struct tran_int_desc
 * @reserved: unused, padding
 * @vector_count: same as in hv_msi_desc
 * @data: This is the "data payload" value that is
 *	written by the device when it generates
 *	a message-signaled interrupt, either MSI
 *	or MSI-X.
 * @address: This is the address to which the data
 *	payload is written on interrupt
 *	generation.
 */
struct tran_int_desc { … } __packed;

/*
 * A generic message format for virtual PCI.
 * Specific message formats are defined later in the file.
 */
struct pci_message { … } __packed;

struct pci_child_message { … } __packed;

struct pci_incoming_message { … } __packed;

struct pci_response { … } __packed;

struct pci_packet { … };

/*
 * Specific message types supporting the PCI protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * pci_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version guest will request.
 * reservedz: Reserved field, set to zero.
 */
struct pci_version_request { … } __packed;
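/*
 * Hedged sketch of the "try versions until one sticks" flow described
 * above. The send-and-wait step is abstracted as a callback because the
 * packet bookkeeping (completion, VMbus requestor) is elided here; the
 * real logic lives in hv_pci_protocol_negotiation() later in this file.
 */
static int __maybe_unused
example_negotiate_version(struct hv_device *hdev,
			  int (*try_version)(struct hv_device *,
					     enum pci_protocol_version_t))
{
	int i, ret = -EPROTO;

	for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
		ret = try_version(hdev, pci_protocol_versions[i]);
		if (!ret)
			break;	/* The host accepted this version. */
		/*
		 * Otherwise (e.g. on STATUS_REVISION_MISMATCH) fall
		 * through and probe the next, lower, version.
		 */
	}

	return ret;
}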
/*
 * Bus D0 Entry. This is sent from the guest to the host when the virtual
 * bus (PCI Express port) is ready for action.
 */
struct pci_bus_d0_entry { … } __packed;

struct pci_bus_relations { … } __packed;

struct pci_bus_relations2 { … } __packed;

struct pci_q_res_req_response { … } __packed;

struct pci_set_power { … } __packed;

struct pci_set_power_response { … } __packed;

struct pci_resources_assigned { … } __packed;

struct pci_resources_assigned2 { … } __packed;

struct pci_create_interrupt { … } __packed;

struct pci_create_int_response { … } __packed;

struct pci_create_interrupt2 { … } __packed;

struct pci_create_interrupt3 { … } __packed;

struct pci_delete_interrupt { … } __packed;

/*
 * Note: the VM must pass a valid block id, wslot and bytes_requested.
 */
struct pci_read_block { … } __packed;

struct pci_read_block_response { … } __packed;

/*
 * Note: the VM must pass a valid block id, wslot and byte_count.
 */
struct pci_write_block { … } __packed;

struct pci_dev_inval_block { … } __packed;

struct pci_dev_incoming { … } __packed;

struct pci_eject_response { … } __packed;

static int pci_ring_size = …;

/*
 * Driver specific state.
 */

enum hv_pcibus_state { … };

struct hv_pcibus_device { … };

/*
 * Tracks "Device Relations" messages from the host, which must be both
 * processed in order and deferred so that they don't run in the context
 * of the incoming packet callback.
 */
struct hv_dr_work { … };

struct hv_pcidev_description { … };

struct hv_dr_state { … };

struct hv_pci_dev { … };

struct hv_pci_compl { … };

static void hv_pci_onchannelcallback(void *context);

#ifdef CONFIG_X86
#define DELIVERY_MODE …
#define FLOW_HANDLER …
#define FLOW_NAME …

static int hv_pci_irqchip_init(void) { … }

static struct irq_domain *hv_pci_get_root_domain(void) { … }

static unsigned int hv_msi_get_int_vector(struct irq_data *data) { … }

#define hv_msi_prepare …

/**
 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
 * affinity.
 * @data: Describes the IRQ
 *
 * Build a new destination for the MSI and make a hypercall to
 * update the Interrupt Redirection Table. "Device Logical ID"
 * is built out of this PCI bus's instance GUID and the function
 * number of the device.
 */
static void hv_arch_irq_unmask(struct irq_data *data) { … }

#elif defined(CONFIG_ARM64)
/*
 * SPI vectors to use for vPCI; the arch SPI range is [32, 1019], but leave
 * a bit of room at the start to allow for SPIs to be specified through ACPI,
 * and start at a power of two to satisfy the power-of-2 multi-MSI
 * requirement.
 */
#define HV_PCI_MSI_SPI_START …
#define HV_PCI_MSI_SPI_NR …
#define DELIVERY_MODE …
#define FLOW_HANDLER …
#define FLOW_NAME …
#define hv_msi_prepare …

struct hv_pci_chip_data {
	DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
	struct mutex map_lock;
};

/* Hyper-V vPCI MSI GIC IRQ domain */
static struct irq_domain *hv_msi_gic_irq_domain;

/* Hyper-V PCI MSI IRQ chip */
static struct irq_chip hv_arm64_msi_irq_chip = {
	.name = "MSI",
	.irq_set_affinity = irq_chip_set_affinity_parent,
	.irq_eoi = irq_chip_eoi_parent,
	.irq_mask = irq_chip_mask_parent,
	.irq_unmask = irq_chip_unmask_parent
};

static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
{
	return irqd->parent_data->hwirq;
}
/*
 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from
 *	the bitmap.
 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from
 *	the parent domain.
 */
static void hv_pci_vec_irq_free(struct irq_domain *domain,
				unsigned int virq,
				unsigned int nr_bm_irqs,
				unsigned int nr_dom_irqs)
{
	struct hv_pci_chip_data *chip_data = domain->host_data;
	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
	int first = d->hwirq - HV_PCI_MSI_SPI_START;
	int i;

	mutex_lock(&chip_data->map_lock);
	bitmap_release_region(chip_data->spi_map,
			      first,
			      get_count_order(nr_bm_irqs));
	mutex_unlock(&chip_data->map_lock);
	for (i = 0; i < nr_dom_irqs; i++) {
		if (i)
			d = irq_domain_get_irq_data(domain, virq + i);
		irq_domain_reset_irq_data(d);
	}

	irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
}

static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
				       unsigned int virq,
				       unsigned int nr_irqs)
{
	hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
}

static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
				       unsigned int nr_irqs,
				       irq_hw_number_t *hwirq)
{
	struct hv_pci_chip_data *chip_data = domain->host_data;
	int index;

	/* Find and allocate region from the SPI bitmap */
	mutex_lock(&chip_data->map_lock);
	index = bitmap_find_free_region(chip_data->spi_map,
					HV_PCI_MSI_SPI_NR,
					get_count_order(nr_irqs));
	mutex_unlock(&chip_data->map_lock);
	if (index < 0)
		return -ENOSPC;

	*hwirq = index + HV_PCI_MSI_SPI_START;

	return 0;
}

static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
					   unsigned int virq,
					   irq_hw_number_t hwirq)
{
	struct irq_fwspec fwspec;
	struct irq_data *d;
	int ret;

	fwspec.fwnode = domain->parent->fwnode;
	fwspec.param_count = 2;
	fwspec.param[0] = hwirq;
	fwspec.param[1] = IRQ_TYPE_EDGE_RISING;

	ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
	if (ret)
		return ret;

	/*
	 * Since the interrupt specifier is not coming from ACPI or DT, the
	 * trigger type will need to be set explicitly. Otherwise, it will be
	 * set to whatever is in the GIC configuration.
	 */
	d = irq_domain_get_irq_data(domain->parent, virq);

	return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
}

static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
				       unsigned int virq, unsigned int nr_irqs,
				       void *args)
{
	irq_hw_number_t hwirq;
	unsigned int i;
	int ret;

	ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
	if (ret)
		return ret;

	for (i = 0; i < nr_irqs; i++) {
		ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
						      hwirq + i);
		if (ret) {
			hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
			return ret;
		}

		irq_domain_set_hwirq_and_chip(domain, virq + i,
					      hwirq + i,
					      &hv_arm64_msi_irq_chip,
					      domain->host_data);
		pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
	}

	return 0;
}

/*
 * Pick the first cpu as the irq affinity that can be temporarily used for
 * composing MSI from the hypervisor. GIC will eventually set the right
 * affinity for the irq and the 'unmask' will retarget the interrupt to that
 * cpu.
 */
static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
					  struct irq_data *irqd, bool reserve)
{
	int cpu = cpumask_first(cpu_present_mask);

	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));

	return 0;
}

static const struct irq_domain_ops hv_pci_domain_ops = {
	.alloc = hv_pci_vec_irq_domain_alloc,
	.free = hv_pci_vec_irq_domain_free,
	.activate = hv_pci_vec_irq_domain_activate,
};

static int hv_pci_irqchip_init(void)
{
	static struct hv_pci_chip_data *chip_data;
	struct fwnode_handle *fn = NULL;
	int ret = -ENOMEM;

	chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
	if (!chip_data)
		return ret;

	mutex_init(&chip_data->map_lock);
	fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
	if (!fn)
		goto free_chip;

	/*
	 * Once enabled, the IRQ domain should not be removed, since there is
	 * no way to ensure that all the corresponding devices are also gone
	 * and no interrupts will be generated.
	 */
	hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
							  fn, &hv_pci_domain_ops,
							  chip_data);

	if (!hv_msi_gic_irq_domain) {
		pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
		goto free_chip;
	}

	return 0;

free_chip:
	kfree(chip_data);
	if (fn)
		irq_domain_free_fwnode(fn);

	return ret;
}

static struct irq_domain *hv_pci_get_root_domain(void)
{
	return hv_msi_gic_irq_domain;
}

/*
 * SPIs are used for the interrupts of PCI devices, and SPIs are managed via
 * GICD registers, which Hyper-V already supports, so no hypercall is needed.
 */
static void hv_arch_irq_unmask(struct irq_data *data) { }

#endif /* CONFIG_ARM64 */

/**
 * hv_pci_generic_compl() - Invoked for a completion packet
 * @context: Set up by the sender of the packet.
 * @resp: The response packet
 * @resp_packet_size: Size in bytes of the packet
 *
 * This function is used to trigger an event and report status
 * for any message for which the completion packet contains a
 * status and nothing else.
 */
static void hv_pci_generic_compl(void *context, struct pci_response *resp,
				 int resp_packet_size) { … }

static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot);

static void get_pcichild(struct hv_pci_dev *hpdev) { … }

static void put_pcichild(struct hv_pci_dev *hpdev) { … }

/*
 * There is no good way to get notified from vmbus_onoffer_rescind(),
 * so let's use polling here, since this is not a hot path.
 */
static int wait_for_response(struct hv_device *hdev,
			     struct completion *comp) { … }

/**
 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 * @devfn: The Linux representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Windows representation
 */
static u32 devfn_to_wslot(int devfn) { … }

/**
 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 * @wslot: The Windows representation of PCI slot
 *
 * Windows uses a slightly different representation of PCI slot.
 *
 * Return: The Linux representation
 */
static int wslot_to_devfn(u32 wslot) { … }

static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size,
			     u32 *val) { … }

static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size,
			      u32 val) { … }

/*
 * PCI Configuration Space for these root PCI buses is implemented as a pair
 * of pages in memory-mapped I/O space. Writing to the first page chooses
 * the PCI function being written or read. Once the first page has been
 * written to, the following page maps in the entire configuration space of
 * the function.
 */
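/*
 * Hedged illustration of the select-then-access pattern described above.
 * The real accessors (_hv_pcifront_read_config() below) also serialize the
 * sequence with a lock and use CFG_PAGE_OFFSET/CFG_PAGE_SIZE, whose values
 * are elided in this listing; the 0x1000 offset and the bare cfg_base
 * parameter here are assumptions for illustration only.
 */
static u32 __maybe_unused example_read_cfg_dword(void __iomem *cfg_base,
						 u32 wslot, int where)
{
	/* Page 0: select which function's config space is mapped in. */
	writel(wslot, cfg_base);
	/* Make sure the function select lands before the data access. */
	mb();
	/* Page 1: read from the selected function's config space. */
	return readl(cfg_base + 0x1000 + where);
}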
/**
 * _hv_pcifront_read_config() - Internal PCI config read
 * @hpdev: The PCI driver's representation of the device
 * @where: Offset within config space
 * @size: Size of the transfer
 * @val: Pointer to the buffer receiving the data
 */
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
				     int size, u32 *val) { … }

static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) { … }

/**
 * _hv_pcifront_write_config() - Internal PCI config write
 * @hpdev: The PCI driver's representation of the device
 * @where: Offset within config space
 * @size: Size of the transfer
 * @val: The data being transferred
 */
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
				      int size, u32 val) { … }

/**
 * hv_pcifront_read_config() - Read configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be read
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
				   int where, int size, u32 *val) { … }

/**
 * hv_pcifront_write_config() - Write configuration space
 * @bus: PCI Bus structure
 * @devfn: Device/function
 * @where: Offset from base
 * @size: Byte/word/dword
 * @val: Value to be written to device
 *
 * Return: PCIBIOS_SUCCESSFUL on success
 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 */
static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
				    int where, int size, u32 val) { … }

/* PCIe operations */
static struct pci_ops hv_pcifront_ops = …;

/*
 * Paravirtual backchannel
 *
 * Hyper-V SR-IOV provides a backchannel mechanism in software for
 * communication between a VF driver and a PF driver. These
 * "configuration blocks" are similar in concept to PCI configuration space,
 * but instead of doing reads and writes in 32-bit chunks through a very slow
 * path, packets of up to 128 bytes can be sent or received asynchronously.
 *
 * Nearly every SR-IOV device contains just such a communications channel in
 * hardware, so using this one in software is usually optional. Using the
 * software channel, however, allows driver implementers to leverage software
 * tools that fuzz the communications channel looking for vulnerabilities.
 *
 * The usage model for these packets puts the responsibility for reading or
 * writing on the VF driver. The VF driver sends a read or a write packet,
 * indicating which "block" is being referred to by number.
 *
 * If the PF driver wishes to initiate communication, it can "invalidate" one
 * or more of the first 64 blocks. This invalidation is delivered via a
 * callback which the VF driver supplies to this driver.
 *
 * No protocol is implied, except that supplied by the PF and VF drivers.
 */

struct hv_read_config_compl { … };

/**
 * hv_pci_read_config_compl() - Invoked when a response packet
 * for a read config block operation arrives.
 * @context: Identifies the read config operation
 * @resp: The response packet itself
 * @resp_packet_size: Size in bytes of the response packet
 */
static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
				     int resp_packet_size) { … }
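/*
 * Hedged usage sketch: how a VF driver might pull config block 0 through
 * the software backchannel described above (the 128-byte cap comes from
 * the protocol description). example_vf_read_block0() is hypothetical;
 * real consumers reach these helpers through an ops table rather than by
 * calling the static function directly. The forward declaration matches
 * the definition just below.
 */
static int hv_read_config_block(struct pci_dev *pdev, void *buf,
				unsigned int len, unsigned int block_id,
				unsigned int *bytes_returned);

static int __maybe_unused example_vf_read_block0(struct pci_dev *pdev)
{
	u8 buf[128];	/* Blocks are at most 128 bytes. */
	unsigned int bytes_returned = 0;
	int ret;

	ret = hv_read_config_block(pdev, buf, sizeof(buf), 0,
				   &bytes_returned);
	if (ret)
		return ret;

	dev_info(&pdev->dev, "config block 0: %u bytes\n", bytes_returned);
	return 0;
}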
/**
 * hv_read_config_block() - Sends a read config block request to
 * the back-end driver running in the Hyper-V parent partition.
 * @pdev: The PCI driver's representation for this device.
 * @buf: Buffer into which the config block will be copied.
 * @len: Size in bytes of buf.
 * @block_id: Identifies the config block which has been requested.
 * @bytes_returned: Size which came back from the back-end driver.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_read_config_block(struct pci_dev *pdev, void *buf,
				unsigned int len, unsigned int block_id,
				unsigned int *bytes_returned) { … }

/**
 * hv_pci_write_config_compl() - Invoked when a response packet for a write
 * config block operation arrives.
 * @context: Identifies the write config operation
 * @resp: The response packet itself
 * @resp_packet_size: Size in bytes of the response packet
 */
static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
				      int resp_packet_size) { … }

/**
 * hv_write_config_block() - Sends a write config block request to the
 * back-end driver running in the Hyper-V parent partition.
 * @pdev: The PCI driver's representation for this device.
 * @buf: Buffer from which the config block will be copied.
 * @len: Size in bytes of buf.
 * @block_id: Identifies the config block which is being written.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_write_config_block(struct pci_dev *pdev, void *buf,
				 unsigned int len, unsigned int block_id) { … }

/**
 * hv_register_block_invalidate() - Invoked when a config block invalidation
 * arrives from the back-end driver.
 * @pdev: The PCI driver's representation for this device.
 * @context: Identifies the device.
 * @block_invalidate: Identifies all of the blocks being invalidated.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
					void (*block_invalidate)(void *context,
								 u64 block_mask)) { … }

/* Interrupt management hooks */
static void hv_int_desc_free(struct hv_pci_dev *hpdev,
			     struct tran_int_desc *int_desc) { … }

/**
 * hv_msi_free() - Free the MSI.
 * @domain: The interrupt domain pointer
 * @info: Extra MSI-related context
 * @irq: Identifies the IRQ.
 *
 * The Hyper-V parent partition and hypervisor are tracking the
 * messages that are in use, keeping the interrupt redirection
 * table up to date. This callback sends a message that frees
 * the IRT entry and related tracking nonsense.
 */
static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
			unsigned int irq) { … }

static void hv_irq_mask(struct irq_data *data) { … }

static void hv_irq_unmask(struct irq_data *data) { … }

struct compose_comp_ctxt { … };

static void hv_pci_compose_compl(void *context, struct pci_response *resp,
				 int resp_packet_size) { … }

static u32 hv_compose_msi_req_v1(struct pci_create_interrupt *int_pkt,
				 u32 slot, u8 vector, u16 vector_count) { … }

/*
 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and
 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be
 * interrupted is specified later in hv_irq_unmask() and communicated to
 * Hyper-V via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of
 * dummy vCPU is not irrelevant, because Hyper-V chooses the physical CPU to
 * handle the interrupts based on the vCPU specified in the message sent to
 * the vPCI VSP in hv_compose_msi_msg(). Hyper-V's choice of pCPU is not
 * visible to the guest, but assigning too many vPCI device interrupts to the
 * same pCPU can cause a performance bottleneck. So we spread out the dummy
 * vCPUs to influence Hyper-V to spread out the pCPUs that it selects.
 *
 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu()
 * to always return the same dummy vCPU, because a second call to
 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a
 * new pCPU for the interrupt. But for the multi-MSI case, the second call to
 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the
 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that
 * the pCPUs are spread out. All interrupts for a multi-MSI device end up
 * using the same pCPU, even though the vCPUs will be spread out by later
 * calls to hv_irq_unmask(); that is the best we can do now.
 *
 * As of Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does
 * *not* cause Hyper-V to reselect the pCPU based on the specified vCPU. Such
 * an enhancement is planned for a future version. With that enhancement, the
 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI
 * device will be spread across multiple pCPUs.
 */
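/*
 * Hedged sketch of the round-robin selection described above. The real
 * hv_compose_multi_msi_req_get_cpu() below is elided in this listing and
 * may differ; this shows one straightforward way to rotate the dummy vCPU
 * across online CPUs under a lock.
 */
static int __maybe_unused example_multi_msi_get_cpu(void)
{
	static DEFINE_SPINLOCK(example_cpu_lock);
	static int cpu_next = -1;	/* -1: start from the first CPU */
	unsigned long flags;
	int cpu;

	spin_lock_irqsave(&example_cpu_lock, flags);
	cpu = cpumask_next(cpu_next, cpu_online_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(cpu_online_mask);
	cpu_next = cpu;
	spin_unlock_irqrestore(&example_cpu_lock, flags);

	return cpu;
}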
/*
 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
 * by the subsequent retarget in hv_irq_unmask().
 */
static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity) { … }

/*
 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0.
 */
static int hv_compose_multi_msi_req_get_cpu(void) { … }

static u32 hv_compose_msi_req_v2(struct pci_create_interrupt2 *int_pkt,
				 int cpu, u32 slot, u8 vector,
				 u16 vector_count) { … }

static u32 hv_compose_msi_req_v3(struct pci_create_interrupt3 *int_pkt,
				 int cpu, u32 slot, u32 vector,
				 u16 vector_count) { … }

/**
 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 * @data: Everything about this MSI
 * @msg: Buffer that is filled in by this function
 *
 * This function unpacks the IRQ looking for target CPU set, IDT
 * vector and mode and sends a message to the parent partition
 * asking for a mapping for that tuple in this partition. The
 * response supplies a data value and address to which that data
 * should be written to trigger that interrupt.
 */
static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { … }

/* HW Interrupt Chip Descriptor */
static struct irq_chip hv_msi_irq_chip = …;

static struct msi_domain_ops hv_msi_ops = …;

/**
 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 * @hbus: The root PCI bus
 *
 * This function creates an IRQ domain which will be used for
 * interrupts from devices that have been passed through. These
 * devices only support MSI and MSI-X, not line-based interrupts
 * or simulations of line-based interrupts through PCIe's
 * fabric-layer messages. Because interrupts are remapped, we
 * can support multi-message MSI here.
 *
 * Return: '0' on success and error value on failure
 */
static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) { … }

/**
 * get_bar_size() - Get the address space consumed by a BAR
 * @bar_val: Value that a BAR returned after -1 was written
 *	to it.
 *
 * This function returns the size of the BAR, rounded up to 1
 * page. It has to be rounded up because the hypervisor's page
 * table entry that maps the BAR into the VM can't specify an
 * offset within a page. The invariant is that the hypervisor
 * must place any BARs smaller than a page at the beginning of
 * a page.
 *
 * Return: Size in bytes of the consumed MMIO space.
 */
static u64 get_bar_size(u64 bar_val) { … }

/**
 * survey_child_resources() - Total all MMIO requirements
 * @hbus: Root PCI bus, as understood by this driver
 */
static void survey_child_resources(struct hv_pcibus_device *hbus) { … }
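/*
 * Hedged sketch of the BAR sizing arithmetic described at get_bar_size()
 * above: mask off the BAR's flag bits, take the two's complement to
 * recover the size, and round up to a whole page. The elided body may
 * differ in detail.
 */
static u64 __maybe_unused example_get_bar_size(u64 bar_val)
{
	/* (1 + ~x) == -x: turns the size-aligned mask back into a size. */
	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
			PAGE_SIZE);
}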
/**
 * prepopulate_bars() - Fill in BARs with defaults
 * @hbus: Root PCI bus, as understood by this driver
 *
 * The core PCI driver code seems much, much happier if the BARs
 * for a device have values upon first scan. So fill them in.
 * The algorithm below works down from large sizes to small,
 * attempting to pack the assignments optimally. The assumption,
 * enforced in other parts of the code, is that the beginning of
 * the memory-mapped I/O space will be aligned on the largest
 * BAR size.
 */
static void prepopulate_bars(struct hv_pcibus_device *hbus) { … }

/*
 * Assign entries in sysfs pci slot directory.
 *
 * Note that this function does not need to lock the children list
 * because it is called from pci_devices_present_work which
 * is serialized with hv_eject_device_work because they are on the
 * same ordered workqueue. Therefore hbus->children list will not change
 * even when pci_create_slot sleeps.
 */
static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) { … }

/*
 * Remove entries in sysfs pci slot directory.
 */
static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) { … }

/*
 * Set NUMA node for the devices on the bus
 */
static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) { … }

/**
 * create_root_hv_pci_bus() - Expose a new root PCI bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * Return: 0 on success, -errno on failure
 */
static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) { … }

struct q_res_req_compl { … };

/**
 * q_resource_requirements() - Query Resource Requirements
 * @context: The completion context.
 * @resp: The response that came from the host.
 * @resp_packet_size: The size in bytes of resp.
 *
 * This function is invoked on completion of a Query Resource
 * Requirements packet.
 */
static void q_resource_requirements(void *context, struct pci_response *resp,
				    int resp_packet_size) { … }

/**
 * new_pcichild_device() - Create a new child device
 * @hbus: The internal struct tracking this root PCI bus.
 * @desc: The information supplied so far from the host
 *	about the device.
 *
 * This function creates the tracking structure for a new child
 * device and kicks off the process of figuring out what it is.
 *
 * Return: Pointer to the new tracking struct
 */
static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
					      struct hv_pcidev_description *desc) { … }

/**
 * get_pcichild_wslot() - Find device from slot
 * @hbus: Root PCI bus, as understood by this driver
 * @wslot: Location on the bus
 *
 * This function looks up a PCI device and returns the internal
 * representation of it. It acquires a reference on it, so that
 * the device won't be deleted while somebody is using it. The
 * caller is responsible for calling put_pcichild() to release
 * this reference.
 *
 * Return: Internal representation of a PCI device
 */
static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
					     u32 wslot) { … }
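/*
 * Hedged sketch of the lookup-with-reference pattern that
 * get_pcichild_wslot() implements. The hv_pcibus_device field names used
 * here (device_list_lock, children, list_entry) and desc.win_slot are
 * assumptions, since the struct bodies are elided above.
 */
static __maybe_unused struct hv_pci_dev *
example_find_child(struct hv_pcibus_device *hbus, u32 wslot)
{
	struct hv_pci_dev *iter, *hpdev = NULL;
	unsigned long flags;

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(iter, &hbus->children, list_entry) {
		if (iter->desc.win_slot.slot == wslot) {
			/* Take the reference before dropping the lock. */
			hpdev = iter;
			get_pcichild(hpdev);
			break;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	return hpdev;	/* Caller must put_pcichild() when done. */
}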
/**
 * pci_devices_present_work() - Handle new list of child devices
 * @work: Work struct embedded in struct hv_dr_work
 *
 * "Bus Relations" is the Windows term for "children of this
 * bus." The terminology is preserved here for people trying to
 * debug the interaction between Hyper-V and Linux. This
 * function is called when the parent partition reports a list
 * of functions that should be observed under this PCI Express
 * port (bus).
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information. The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is serialized with hv_eject_device_work(),
 * because both are pushed to the ordered workqueue hbus->wq.
 */
static void pci_devices_present_work(struct work_struct *work) { … }

/**
 * hv_pci_start_relations_work() - Queue work to start device discovery
 * @hbus: Root PCI bus, as understood by this driver
 * @dr: The list of children returned from host
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
				       struct hv_dr_state *dr) { … }

/**
 * hv_pci_devices_present() - Handle list of new children
 * @hbus: Root PCI bus, as understood by this driver
 * @relations: Packet from host listing children
 *
 * Process a new list of devices on the bus. The list of devices is
 * discovered by the VSP and sent to us via the VSP message
 * PCI_BUS_RELATIONS, whenever a new list of devices for this bus
 * appears.
 */
static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
				   struct pci_bus_relations *relations) { … }

/**
 * hv_pci_devices_present2() - Handle list of new children
 * @hbus: Root PCI bus, as understood by this driver
 * @relations: Packet from host listing children
 *
 * This function is the v2 version of hv_pci_devices_present().
 */
static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
				    struct pci_bus_relations2 *relations) { … }

/**
 * hv_eject_device_work() - Asynchronously handles ejection
 * @work: Work struct embedded in internal device struct
 *
 * This function handles ejecting a device. Windows will
 * attempt to gracefully eject a device, waiting 60 seconds to
 * hear back from the guest OS that this completed successfully.
 * If this timer expires, the device will be forcibly removed.
 */
static void hv_eject_device_work(struct work_struct *work) { … }

/**
 * hv_pci_eject_device() - Handles device ejection
 * @hpdev: Internal device tracking struct
 *
 * This function is invoked when an ejection packet arrives. It
 * just schedules work so that we don't re-enter the packet
 * delivery code handling the ejection.
 */
static void hv_pci_eject_device(struct hv_pci_dev *hpdev) { … }

/**
 * hv_pci_onchannelcallback() - Handles incoming packets
 * @context: Internal bus tracking struct
 *
 * This function is invoked whenever the host sends a packet to
 * this channel (which is private to this root PCI bus).
 */
static void hv_pci_onchannelcallback(void *context) { … }
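/*
 * Hedged sketch of the dispatch pattern hv_pci_onchannelcallback()
 * implements: drain the VMbus ring, complete guest-initiated requests,
 * and defer host-initiated messages (bus relations, eject, block
 * invalidate) to workers. The packet-type handling mirrors the
 * description above; buffer sizing and the bookkeeping details are
 * assumptions, since the real body is elided.
 */
static void __maybe_unused example_channel_callback(void *context)
{
	struct vmbus_channel *chan = context;
	u8 buf[0x100] __aligned(8);
	struct vmpacket_descriptor *desc = (struct vmpacket_descriptor *)buf;
	u32 bytes_recvd;
	u64 req_id;

	while (!vmbus_recvpacket_raw(chan, buf, sizeof(buf),
				     &bytes_recvd, &req_id) &&
	       bytes_recvd > 0) {
		switch (desc->type) {
		case VM_PKT_COMP:
			/*
			 * Completion of a request this driver sent; req_id
			 * identifies the waiter to wake (via the VMbus
			 * requestor in the real driver).
			 */
			break;
		case VM_PKT_DATA_INBAND:
			/*
			 * Host-initiated message: queue work rather than
			 * processing it in the channel callback itself.
			 */
			break;
		default:
			break;
		}
	}
}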
/**
 * hv_pci_protocol_negotiation() - Set up protocol
 * @hdev: VMBus's tracking struct for this root PCI bus.
 * @version: Array of supported channel protocol versions in
 *	the order of probing - highest goes first.
 * @num_version: Number of elements in the version array.
 *
 * This driver is intended to support running on Windows 10
 * (server) and later versions. It will not run on earlier
 * versions, as they assume that many of the operations which
 * Linux needs accomplished with a spinlock held were done
 * through asynchronous messaging over VMBus. Windows 10
 * increases the surface area of PCI emulation so that these
 * actions can take place by suspending a virtual processor for
 * their duration.
 *
 * This function negotiates the channel protocol version,
 * failing if the host doesn't support the necessary protocol
 * level.
 */
static int hv_pci_protocol_negotiation(struct hv_device *hdev,
				       enum pci_protocol_version_t version[],
				       int num_version) { … }

/**
 * hv_pci_free_bridge_windows() - Release memory regions for the
 * bus
 * @hbus: Root PCI bus, as understood by this driver
 */
static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) { … }

/**
 * hv_pci_allocate_bridge_windows() - Allocate memory regions
 * for the bus
 * @hbus: Root PCI bus, as understood by this driver
 *
 * This function calls vmbus_allocate_mmio(), which is itself a
 * bit of a compromise. Ideally, we might change the pnp layer
 * in the kernel such that it comprehends either PCI devices
 * which are "grandchildren of ACPI," with some intermediate bus
 * node (in this case, VMBus) or change it such that it
 * understands VMBus. The pnp layer, however, has been declared
 * deprecated, and not subject to change.
 *
 * The workaround, implemented here, is to ask VMBus to allocate
 * MMIO space for this bus. VMBus itself knows which ranges are
 * appropriate by looking at its own ACPI objects. Then, after
 * these ranges are claimed, they're modified to look like they
 * would have looked if the ACPI and pnp code had allocated
 * bridge windows. These descriptors have to exist in this form
 * in order to satisfy the code which will get invoked when the
 * endpoint PCI function driver calls request_mem_region() or
 * request_mem_region_exclusive().
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) { … }

/**
 * hv_allocate_config_window() - Find MMIO space for PCI Config
 * @hbus: Root PCI bus, as understood by this driver
 *
 * This function claims memory-mapped I/O space for accessing
 * configuration space for the functions on this bus.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_allocate_config_window(struct hv_pcibus_device *hbus) { … }

static void hv_free_config_window(struct hv_pcibus_device *hbus) { … }

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);

/**
 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_enter_d0(struct hv_device *hdev) { … }

/**
 * hv_pci_query_relations() - Ask host to send list of child
 * devices
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_query_relations(struct hv_device *hdev) { … }
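/*
 * Hedged sketch of the bridge-window workaround described at
 * hv_pci_allocate_bridge_windows() above: claim MMIO from VMbus, then
 * mark the resource so it looks like a host-bridge window to endpoint
 * drivers calling request_mem_region(). The SZ_1M alignment and the flag
 * twiddling are illustrative assumptions; the real function also handles
 * a separate high-MMIO (above 4 GiB) window.
 */
static int __maybe_unused example_alloc_low_mmio(struct hv_device *hdev,
						 resource_size_t size,
						 struct resource **low_mmio)
{
	int ret;

	/* Ask VMbus for a range below 4 GiB from its ACPI-derived pool. */
	ret = vmbus_allocate_mmio(low_mmio, hdev, 0, 0xFFFFFFFF, size,
				  SZ_1M, false);
	if (ret)
		return ret;

	/* Present it as a (non-busy) bridge window. */
	(*low_mmio)->flags |= IORESOURCE_WINDOW;
	(*low_mmio)->flags &= ~IORESOURCE_BUSY;
	return 0;
}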
/**
 * hv_send_resources_allocated() - Report local resource choices
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * The host OS is expecting to be sent a request as a message
 * which contains all the resources that the device will use.
 * The response contains those same resources, "translated",
 * which is to say, the values which should be used by the
 * hardware, when it delivers an interrupt. (MMIO resources are
 * used in local terms.) This is nice for Windows, and lines up
 * with the FDO/PDO split, which doesn't exist in Linux. Linux
 * is deeply expecting to scan an emulated PCI configuration
 * space. So this message is sent here only to drive the state
 * machine on the host forward.
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_allocated(struct hv_device *hdev) { … }

/**
 * hv_send_resources_released() - Report local resources
 * released
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_send_resources_released(struct hv_device *hdev) { … }

#define HVPCI_DOM_MAP_SIZE …
static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);

/*
 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
 * as invalid for passthrough PCI devices of this driver.
 */
#define HVPCI_DOM_INVALID …

/**
 * hv_get_dom_num() - Get a valid PCI domain number
 * @dom: Requested domain number
 *
 * Check if the PCI domain number is in use, and return another number if
 * it is in use.
 *
 * Return: domain number on success, HVPCI_DOM_INVALID on failure
 */
static u16 hv_get_dom_num(u16 dom) { … }

/**
 * hv_put_dom_num() - Mark the PCI domain number as free
 * @dom: Domain number to be freed
 */
static void hv_put_dom_num(u16 dom) { … }

/**
 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
 * @hdev: VMBus's tracking struct for this root PCI bus
 * @dev_id: Identifies the device itself
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_probe(struct hv_device *hdev,
			const struct hv_vmbus_device_id *dev_id) { … }

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { … }

/**
 * hv_pci_remove() - Remove routine for this VMBus channel
 * @hdev: VMBus's tracking struct for this root PCI bus
 */
static void hv_pci_remove(struct hv_device *hdev) { … }

static int hv_pci_suspend(struct hv_device *hdev) { … }

static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) { … }

/*
 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
 * doesn't trap and emulate the MMIO accesses, hv_compose_msi_msg() must
 * be used here to ask Hyper-V to re-create the IOMMU Interrupt Remapping
 * Table entries.
 */
static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) { … }

static int hv_pci_resume(struct hv_device *hdev) { … }

static const struct hv_vmbus_device_id hv_pci_id_table[] = …;
MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

static struct hv_driver hv_pci_drv = …;

static void __exit exit_hv_pci_drv(void) { … }

static int __init init_hv_pci_drv(void) { … }

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION(…);
MODULE_LICENSE(…);