diff options
-rw-r--r-- | Documentation/powerpc/pci_iov_resource_on_powernv.txt | 301 | ||||
-rw-r--r-- | arch/powerpc/include/asm/iommu.h | 3 | ||||
-rw-r--r-- | arch/powerpc/include/asm/machdep.h | 5 | ||||
-rw-r--r-- | arch/powerpc/include/asm/pci-bridge.h | 13 | ||||
-rw-r--r-- | arch/powerpc/kernel/pci-common.c | 20 | ||||
-rw-r--r-- | arch/powerpc/kernel/pci_dn.c | 129 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 772 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 18 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 9 | ||||
-rw-r--r-- | drivers/pci/iov.c | 155 | ||||
-rw-r--r-- | drivers/pci/pci.h | 2 | ||||
-rw-r--r-- | drivers/pci/setup-bus.c | 95 | ||||
-rw-r--r-- | include/linux/pci.h | 15 |
13 files changed, 1448 insertions, 89 deletions
diff --git a/Documentation/powerpc/pci_iov_resource_on_powernv.txt b/Documentation/powerpc/pci_iov_resource_on_powernv.txt new file mode 100644 index 000000000000..b55c5cd83f8d --- /dev/null +++ b/Documentation/powerpc/pci_iov_resource_on_powernv.txt @@ -0,0 +1,301 @@ +Wei Yang <weiyang@linux.vnet.ibm.com> +Benjamin Herrenschmidt <benh@au1.ibm.com> +Bjorn Helgaas <bhelgaas@google.com> +26 Aug 2014 + +This document describes the requirement from hardware for PCI MMIO resource +sizing and assignment on PowerKVM and how generic PCI code handles this +requirement. The first two sections describe the concepts of Partitionable +Endpoints and the implementation on P8 (IODA2). The next two sections talks +about considerations on enabling SRIOV on IODA2. + +1. Introduction to Partitionable Endpoints + +A Partitionable Endpoint (PE) is a way to group the various resources +associated with a device or a set of devices to provide isolation between +partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism +to freeze a device that is causing errors in order to limit the possibility +of propagation of bad data. + +There is thus, in HW, a table of PE states that contains a pair of "frozen" +state bits (one for MMIO and one for DMA, they get set together but can be +cleared independently) for each PE. + +When a PE is frozen, all stores in any direction are dropped and all loads +return all 1's value. MSIs are also blocked. There's a bit more state that +captures things like the details of the error that caused the freeze etc., but +that's not critical. + +The interesting part is how the various PCIe transactions (MMIO, DMA, ...) +are matched to their corresponding PEs. + +The following section provides a rough description of what we have on P8 +(IODA2). Keep in mind that this is all per PHB (PCI host bridge). Each PHB +is a completely separate HW entity that replicates the entire logic, so has +its own set of PEs, etc. + +2. Implementation of Partitionable Endpoints on P8 (IODA2) + +P8 supports up to 256 Partitionable Endpoints per PHB. + + * Inbound + + For DMA, MSIs and inbound PCIe error messages, we have a table (in + memory but accessed in HW by the chip) that provides a direct + correspondence between a PCIe RID (bus/dev/fn) with a PE number. + We call this the RTT. + + - For DMA we then provide an entire address space for each PE that can + contain two "windows", depending on the value of PCI address bit 59. + Each window can be configured to be remapped via a "TCE table" (IOMMU + translation table), which has various configurable characteristics + not described here. + + - For MSIs, we have two windows in the address space (one at the top of + the 32-bit space and one much higher) which, via a combination of the + address and MSI value, will result in one of the 2048 interrupts per + bridge being triggered. There's a PE# in the interrupt controller + descriptor table as well which is compared with the PE# obtained from + the RTT to "authorize" the device to emit that specific interrupt. + + - Error messages just use the RTT. + + * Outbound. That's where the tricky part is. + + Like other PCI host bridges, the Power8 IODA2 PHB supports "windows" + from the CPU address space to the PCI address space. There is one M32 + window and sixteen M64 windows. They have different characteristics. + First what they have in common: they forward a configurable portion of + the CPU address space to the PCIe bus and must be naturally aligned + power of two in size. The rest is different: + + - The M32 window: + + * Is limited to 4GB in size. + + * Drops the top bits of the address (above the size) and replaces + them with a configurable value. This is typically used to generate + 32-bit PCIe accesses. We configure that window at boot from FW and + don't touch it from Linux; it's usually set to forward a 2GB + portion of address space from the CPU to PCIe + 0x8000_0000..0xffff_ffff. (Note: The top 64KB are actually + reserved for MSIs but this is not a problem at this point; we just + need to ensure Linux doesn't assign anything there, the M32 logic + ignores that however and will forward in that space if we try). + + * It is divided into 256 segments of equal size. A table in the chip + maps each segment to a PE#. That allows portions of the MMIO space + to be assigned to PEs on a segment granularity. For a 2GB window, + the segment granularity is 2GB/256 = 8MB. + + Now, this is the "main" window we use in Linux today (excluding + SR-IOV). We basically use the trick of forcing the bridge MMIO windows + onto a segment alignment/granularity so that the space behind a bridge + can be assigned to a PE. + + Ideally we would like to be able to have individual functions in PEs + but that would mean using a completely different address allocation + scheme where individual function BARs can be "grouped" to fit in one or + more segments. + + - The M64 windows: + + * Must be at least 256MB in size. + + * Do not translate addresses (the address on PCIe is the same as the + address on the PowerBus). There is a way to also set the top 14 + bits which are not conveyed by PowerBus but we don't use this. + + * Can be configured to be segmented. When not segmented, we can + specify the PE# for the entire window. When segmented, a window + has 256 segments; however, there is no table for mapping a segment + to a PE#. The segment number *is* the PE#. + + * Support overlaps. If an address is covered by multiple windows, + there's a defined ordering for which window applies. + + We have code (fairly new compared to the M32 stuff) that exploits that + for large BARs in 64-bit space: + + We configure an M64 window to cover the entire region of address space + that has been assigned by FW for the PHB (about 64GB, ignore the space + for the M32, it comes out of a different "reserve"). We configure it + as segmented. + + Then we do the same thing as with M32, using the bridge alignment + trick, to match to those giant segments. + + Since we cannot remap, we have two additional constraints: + + - We do the PE# allocation *after* the 64-bit space has been assigned + because the addresses we use directly determine the PE#. We then + update the M32 PE# for the devices that use both 32-bit and 64-bit + spaces or assign the remaining PE# to 32-bit only devices. + + - We cannot "group" segments in HW, so if a device ends up using more + than one segment, we end up with more than one PE#. There is a HW + mechanism to make the freeze state cascade to "companion" PEs but + that only works for PCIe error messages (typically used so that if + you freeze a switch, it freezes all its children). So we do it in + SW. We lose a bit of effectiveness of EEH in that case, but that's + the best we found. So when any of the PEs freezes, we freeze the + other ones for that "domain". We thus introduce the concept of + "master PE" which is the one used for DMA, MSIs, etc., and "secondary + PEs" that are used for the remaining M64 segments. + + We would like to investigate using additional M64 windows in "single + PE" mode to overlay over specific BARs to work around some of that, for + example for devices with very large BARs, e.g., GPUs. It would make + sense, but we haven't done it yet. + +3. Considerations for SR-IOV on PowerKVM + + * SR-IOV Background + + The PCIe SR-IOV feature allows a single Physical Function (PF) to + support several Virtual Functions (VFs). Registers in the PF's SR-IOV + Capability control the number of VFs and whether they are enabled. + + When VFs are enabled, they appear in Configuration Space like normal + PCI devices, but the BARs in VF config space headers are unusual. For + a non-VF device, software uses BARs in the config space header to + discover the BAR sizes and assign addresses for them. For VF devices, + software uses VF BAR registers in the *PF* SR-IOV Capability to + discover sizes and assign addresses. The BARs in the VF's config space + header are read-only zeros. + + When a VF BAR in the PF SR-IOV Capability is programmed, it sets the + base address for all the corresponding VF(n) BARs. For example, if the + PF SR-IOV Capability is programmed to enable eight VFs, and it has a + 1MB VF BAR0, the address in that VF BAR sets the base of an 8MB region. + This region is divided into eight contiguous 1MB regions, each of which + is a BAR0 for one of the VFs. Note that even though the VF BAR + describes an 8MB region, the alignment requirement is for a single VF, + i.e., 1MB in this example. + + There are several strategies for isolating VFs in PEs: + + - M32 window: There's one M32 window, and it is split into 256 + equally-sized segments. The finest granularity possible is a 256MB + window with 1MB segments. VF BARs that are 1MB or larger could be + mapped to separate PEs in this window. Each segment can be + individually mapped to a PE via the lookup table, so this is quite + flexible, but it works best when all the VF BARs are the same size. If + they are different sizes, the entire window has to be small enough that + the segment size matches the smallest VF BAR, which means larger VF + BARs span several segments. + + - Non-segmented M64 window: A non-segmented M64 window is mapped entirely + to a single PE, so it could only isolate one VF. + + - Single segmented M64 windows: A segmented M64 window could be used just + like the M32 window, but the segments can't be individually mapped to + PEs (the segment number is the PE#), so there isn't as much + flexibility. A VF with multiple BARs would have to be in a "domain" of + multiple PEs, which is not as well isolated as a single PE. + + - Multiple segmented M64 windows: As usual, each window is split into 256 + equally-sized segments, and the segment number is the PE#. But if we + use several M64 windows, they can be set to different base addresses + and different segment sizes. If we have VFs that each have a 1MB BAR + and a 32MB BAR, we could use one M64 window to assign 1MB segments and + another M64 window to assign 32MB segments. + + Finally, the plan to use M64 windows for SR-IOV, which will be described + more in the next two sections. For a given VF BAR, we need to + effectively reserve the entire 256 segments (256 * VF BAR size) and + position the VF BAR to start at the beginning of a free range of + segments/PEs inside that M64 window. + + The goal is of course to be able to give a separate PE for each VF. + + The IODA2 platform has 16 M64 windows, which are used to map MMIO + range to PE#. Each M64 window defines one MMIO range and this range is + divided into 256 segments, with each segment corresponding to one PE. + + We decide to leverage this M64 window to map VFs to individual PEs, since + SR-IOV VF BARs are all the same size. + + But doing so introduces another problem: total_VFs is usually smaller + than the number of M64 window segments, so if we map one VF BAR directly + to one M64 window, some part of the M64 window will map to another + device's MMIO range. + + IODA supports 256 PEs, so segmented windows contain 256 segments, so if + total_VFs is less than 256, we have the situation in Figure 1.0, where + segments [total_VFs, 255] of the M64 window may map to some MMIO range on + other devices: + + 0 1 total_VFs - 1 + +------+------+- -+------+------+ + | | | ... | | | + +------+------+- -+------+------+ + + VF(n) BAR space + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + M64 window + + Figure 1.0 Direct map VF(n) BAR space + + Our current solution is to allocate 256 segments even if the VF(n) BAR + space doesn't need that much, as shown in Figure 1.1: + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + VF(n) BAR space + extra + + 0 1 total_VFs - 1 255 + +------+------+- -+------+------+- -+------+------+ + | | | ... | | | ... | | | + +------+------+- -+------+------+- -+------+------+ + + M64 window + + Figure 1.1 Map VF(n) BAR space + extra + + Allocating the extra space ensures that the entire M64 window will be + assigned to this one SR-IOV device and none of the space will be + available for other devices. Note that this only expands the space + reserved in software; there are still only total_VFs VFs, and they only + respond to segments [0, total_VFs - 1]. There's nothing in hardware that + responds to segments [total_VFs, 255]. + +4. Implications for the Generic PCI Code + +The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be +aligned to the size of an individual VF BAR. + +In IODA2, the MMIO address determines the PE#. If the address is in an M32 +window, we can set the PE# by updating the table that translates segments +to PE#s. Similarly, if the address is in an unsegmented M64 window, we can +set the PE# for the window. But if it's in a segmented M64 window, the +segment number is the PE#. + +Therefore, the only way to control the PE# for a VF is to change the base +of the VF(n) BAR space in the VF BAR. If the PCI core allocates the exact +amount of space required for the VF(n) BAR space, the VF BAR value is fixed +and cannot be changed. + +On the other hand, if the PCI core allocates additional space, the VF BAR +value can be changed as long as the entire VF(n) BAR space remains inside +the space allocated by the core. + +Ideally the segment size will be the same as an individual VF BAR size. +Then each VF will be in its own PE. The VF BARs (and therefore the PE#s) +are contiguous. If VF0 is in PE(x), then VF(n) is in PE(x+n). If we +allocate 256 segments, there are (256 - numVFs) choices for the PE# of VF0. + +If the segment size is smaller than the VF BAR size, it will take several +segments to cover a VF BAR, and a VF will be in several PEs. This is +possible, but the isolation isn't as good, and it reduces the number of PE# +choices because instead of consuming only numVFs segments, the VF(n) BAR +space will consume (numVFs * n) segments. That means there aren't as many +available segments for adjusting base of the VF(n) BAR space. diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 0be7d9e13189..1e27d6338565 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -79,6 +79,9 @@ struct iommu_table { struct iommu_group *it_group; #endif void (*set_bypass)(struct iommu_table *tbl, bool enable); +#ifdef CONFIG_PPC_POWERNV + void *data; +#endif }; /* Pure 2^n version of get_order */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 5c19ac527a8e..ef8899432ae7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -236,6 +236,11 @@ struct machdep_calls { /* Called after scan and before resource survey */ void (*pcibios_fixup_phb)(struct pci_controller *hose); +#ifdef CONFIG_PCI_IOV + void (*pcibios_fixup_sriov)(struct pci_dev *pdev); + resource_size_t (*pcibios_iov_resource_alignment)(struct pci_dev *, int resno); +#endif /* CONFIG_PCI_IOV */ + /* Called to shutdown machine specific hardware not already controlled * by other drivers. */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 7d972bc85638..1811c44bf34b 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -175,6 +175,7 @@ struct iommu_table; struct pci_dn { int flags; +#define PCI_DN_FLAG_IOV_VF 0x01 int busno; /* pci bus number */ int devfn; /* pci device and function number */ @@ -189,13 +190,21 @@ struct pci_dn { int pci_ext_config_space; /* for pci devices */ - struct pci_dev *pcidev; /* back-pointer to the pci device */ #ifdef CONFIG_EEH struct eeh_dev *edev; /* eeh device */ #endif #define IODA_INVALID_PE (-1) #ifdef CONFIG_PPC_POWERNV int pe_number; +#ifdef CONFIG_PCI_IOV + u16 vfs_expanded; /* number of VFs IOV BAR expanded */ + u16 num_vfs; /* number of VFs enabled*/ + int offset; /* PE# for the first VF PE */ +#define M64_PER_IOV 4 + int m64_per_iov; +#define IODA_INVALID_M64 (-1) + int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV]; +#endif /* CONFIG_PCI_IOV */ #endif struct list_head child_list; struct list_head list; @@ -207,6 +216,8 @@ struct pci_dn { extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, int devfn); extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); +extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); +extern void remove_dev_pci_data(struct pci_dev *pdev); extern void *update_dn_pci_info(struct device_node *dn, void *data); static inline int pci_device_from_OF_node(struct device_node *np, diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 9052b4fbc41f..0d054068a21d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -134,6 +134,16 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev) pci_reset_secondary_bus(dev); } +#ifdef CONFIG_PCI_IOV +resource_size_t pcibios_iov_resource_alignment(struct pci_dev *pdev, int resno) +{ + if (ppc_md.pcibios_iov_resource_alignment) + return ppc_md.pcibios_iov_resource_alignment(pdev, resno); + + return pci_iov_resource_size(pdev, resno); +} +#endif /* CONFIG_PCI_IOV */ + static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -792,6 +802,10 @@ static void pcibios_fixup_resources(struct pci_dev *dev) pci_name(dev)); return; } + + if (dev->is_virtfn) + return; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { struct resource *res = dev->resource + i; struct pci_bus_region reg; @@ -995,6 +1009,12 @@ int pcibios_add_device(struct pci_dev *dev) */ if (dev->bus->is_added) pcibios_setup_device(dev); + +#ifdef CONFIG_PCI_IOV + if (ppc_md.pcibios_fixup_sriov) + ppc_md.pcibios_fixup_sriov(dev); +#endif /* CONFIG_PCI_IOV */ + return 0; } diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 65b98367005c..b3b4df91b792 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -136,6 +136,135 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) return NULL; } +#ifdef CONFIG_PCI_IOV +static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, + struct pci_dev *pdev, + int busno, int devfn) +{ + struct pci_dn *pdn; + + /* Except PHB, we always have the parent */ + if (!parent) + return NULL; + + pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); + if (!pdn) { + dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__); + return NULL; + } + + pdn->phb = parent->phb; + pdn->parent = parent; + pdn->busno = busno; + pdn->devfn = devfn; +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif + INIT_LIST_HEAD(&pdn->child_list); + INIT_LIST_HEAD(&pdn->list); + list_add_tail(&pdn->list, &parent->child_list); + + /* + * If we already have PCI device instance, lets + * bind them. + */ + if (pdev) + pdev->dev.archdata.pci_data = pdn; + + return pdn; +} +#endif + +struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) +{ +#ifdef CONFIG_PCI_IOV + struct pci_dn *parent, *pdn; + int i; + + /* Only support IOV for now */ + if (!pdev->is_physfn) + return pci_get_pdn(pdev); + + /* Check if VFs have been populated */ + pdn = pci_get_pdn(pdev); + if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) + return NULL; + + pdn->flags |= PCI_DN_FLAG_IOV_VF; + parent = pci_bus_to_pdn(pdev->bus); + if (!parent) + return NULL; + + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { + pdn = add_one_dev_pci_data(parent, NULL, + pci_iov_virtfn_bus(pdev, i), + pci_iov_virtfn_devfn(pdev, i)); + if (!pdn) { + dev_warn(&pdev->dev, "%s: Cannot create firmware data for VF#%d\n", + __func__, i); + return NULL; + } + } +#endif /* CONFIG_PCI_IOV */ + + return pci_get_pdn(pdev); +} + +void remove_dev_pci_data(struct pci_dev *pdev) +{ +#ifdef CONFIG_PCI_IOV + struct pci_dn *parent; + struct pci_dn *pdn, *tmp; + int i; + + /* + * VF and VF PE are created/released dynamically, so we need to + * bind/unbind them. Otherwise the VF and VF PE would be mismatched + * when re-enabling SR-IOV. + */ + if (pdev->is_virtfn) { + pdn = pci_get_pdn(pdev); +#ifdef CONFIG_PPC_POWERNV + pdn->pe_number = IODA_INVALID_PE; +#endif + return; + } + + /* Only support IOV PF for now */ + if (!pdev->is_physfn) + return; + + /* Check if VFs have been populated */ + pdn = pci_get_pdn(pdev); + if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF)) + return; + + pdn->flags &= ~PCI_DN_FLAG_IOV_VF; + parent = pci_bus_to_pdn(pdev->bus); + if (!parent) + return; + + /* + * We might introduce flag to pci_dn in future + * so that we can release VF's firmware data in + * a batch mode. + */ + for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { + list_for_each_entry_safe(pdn, tmp, + &parent->child_list, list) { + if (pdn->busno != pci_iov_virtfn_bus(pdev, i) || + pdn->devfn != pci_iov_virtfn_devfn(pdev, i)) + continue; + + if (!list_empty(&pdn->list)) + list_del(&pdn->list); + + kfree(pdn); + } + } +#endif /* CONFIG_PCI_IOV */ +} + /* * Traverse_func that inits the PCI fields of the device node. * NOTE: this *must* be done before read/write config to the device. diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5e917753c672..920c252d1f49 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -44,6 +44,9 @@ #include "powernv.h" #include "pci.h" +/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) { @@ -56,11 +59,18 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, vaf.fmt = fmt; vaf.va = &args; - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); - else + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) sprintf(pfix, "%04x:%02x ", pci_domain_nr(pe->pbus), pe->pbus->number); +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + sprintf(pfix, "%04x:%02x:%2x.%d", + pci_domain_nr(pe->parent_dev->bus), + (pe->rid & 0xff00) >> 8, + PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); +#endif /* CONFIG_PCI_IOV*/ printk("%spci %s: [PE# %.3d] %pV", level, pfix, pe->pe_number, &vaf); @@ -591,7 +601,7 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, bool is_add) { struct pnv_ioda_pe *slave; - struct pci_dev *pdev; + struct pci_dev *pdev = NULL; int ret; /* @@ -630,8 +640,12 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) pdev = pe->pbus->self; - else + else if (pe->flags & PNV_IODA_PE_DEV) pdev = pe->pdev->bus->self; +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + pdev = pe->parent_dev->bus->self; +#endif /* CONFIG_PCI_IOV */ while (pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *parent; @@ -649,6 +663,87 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } +#ifdef CONFIG_PCI_IOV +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + int64_t rc; + long rid_end, rid; + + /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + if (pe->flags & PNV_IODA_PE_BUS_ALL) + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + else + count = 1; + + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Clear the reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = 0; + + /* Release from all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); + if (rc) + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + + pe->pbus = NULL; + pe->pdev = NULL; + pe->parent_dev = NULL; + + return 0; +} +#endif /* CONFIG_PCI_IOV */ + static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; @@ -675,15 +770,19 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) case 16: bcomp = OpalPciBus4Bits; break; case 32: bcomp = OpalPciBus3Bits; break; default: - pr_err("%s: Number of subordinate busses %d" - " unsupported\n", - pci_name(pe->pbus->self), count); + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); /* Do an exact match only */ bcomp = OpalPciBusAll; } rid_end = pe->rid + (count << 8); } else { - parent = pe->pdev->bus->self; +#ifdef CONFIG_PCI_IOV + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else +#endif /* CONFIG_PCI_IOV */ + parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; @@ -774,6 +873,78 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) return 10; } +#ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct pci_dn *pdn = pci_get_pdn(dev); + int i; + struct resource *res, res2; + resource_size_t size; + u16 num_vfs; + + if (!dev->is_physfn) + return -EINVAL; + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. + * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. + */ + num_vfs = pdn->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the actual size for num_vfs VFs BAR. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * After doing so, there would be a "hole" in the /proc/iomem when + * offset is a positive value. It looks like the device return some + * mmio back to the system, which actually no one could use it. + */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + } + return 0; +} +#endif /* CONFIG_PCI_IOV */ + #if 0 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { @@ -857,7 +1028,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) pci_name(dev)); continue; } - pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->dma_weight += pnv_ioda_dma_weight(dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) @@ -916,6 +1086,10 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) return; } + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), + GFP_KERNEL, hose->node); + pe->tce32_table->data = pe; + /* Associate it with all child devices */ pnv_ioda_setup_same_PE(bus, pe); @@ -974,6 +1148,441 @@ static void pnv_pci_ioda_setup_PEs(void) } } +#ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_release_m64(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int i, j; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + for (j = 0; j < M64_PER_IOV; j++) { + if (pdn->m64_wins[i][j] == IODA_INVALID_M64) + continue; + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); + clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); + pdn->m64_wins[i][j] = IODA_INVALID_M64; + } + + return 0; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + unsigned int win; + struct resource *res; + int i, j; + int64_t rc; + int total_vfs; + resource_size_t size, start; + int pe_num; + int vf_groups; + int vf_per_group; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + total_vfs = pci_sriov_get_totalvfs(pdev); + + /* Initialize the m64_wins to IODA_INVALID_M64 */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + for (j = 0; j < M64_PER_IOV; j++) + pdn->m64_wins[i][j] = IODA_INVALID_M64; + + if (pdn->m64_per_iov == M64_PER_IOV) { + vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; + vf_per_group = (num_vfs <= M64_PER_IOV)? 1: + roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + } else { + vf_groups = 1; + vf_per_group = 1; + } + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + for (j = 0; j < vf_groups; j++) { + do { + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + goto m64_failed; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + pdn->m64_wins[i][j] = win; + + if (pdn->m64_per_iov == M64_PER_IOV) { + size = pci_iov_resource_size(pdev, + PCI_IOV_RESOURCES + i); + size = size * vf_per_group; + start = res->start + size * j; + } else { + size = resource_size(res); + start = res->start; + } + + /* Map the M64 here */ + if (pdn->m64_per_iov == M64_PER_IOV) { + pe_num = pdn->offset + j; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe_num, OPAL_M64_WINDOW_TYPE, + pdn->m64_wins[i][j], 0); + } + + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + pdn->m64_wins[i][j], + start, + 0, /* unused */ + size); + + + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", + win, rc); + goto m64_failed; + } + + if (pdn->m64_per_iov == M64_PER_IOV) + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); + else + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); + + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", + win, rc); + goto m64_failed; + } + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev); + return -EBUSY; +} + +static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct iommu_table *tbl; + unsigned long addr; + int64_t rc; + + bus = dev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + tbl = pe->tce32_table; + addr = tbl->it_base; + + opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(addr), + 0, 0x1000); + + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + (pe->pe_number << 1) + 1, + pe->tce_bypass_base, + 0); + if (rc) + pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + + iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + free_pages(addr, get_order(TCE32_TABLE_SIZE)); + pe->tce32_table = NULL; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + struct pci_dn *pdn; + u16 vf_index; + int64_t rc; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (!pdev->is_physfn) + return; + + if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { + int vf_group; + int vf_per_group; + int vf_index1; + + vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + + for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) + for (vf_index = vf_group * vf_per_group; + vf_index < (vf_group + 1) * vf_per_group && + vf_index < num_vfs; + vf_index++) + for (vf_index1 = vf_group * vf_per_group; + vf_index1 < (vf_group + 1) * vf_per_group && + vf_index1 < num_vfs; + vf_index1++){ + + rc = opal_pci_set_peltv(phb->opal_id, + pdn->offset + vf_index, + pdn->offset + vf_index1, + OPAL_REMOVE_PE_FROM_DOMAIN); + + if (rc) + dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", + __func__, + pdn->offset + vf_index1, rc); + } + } + + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_dma_pe(pdev, pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(phb, pe->pe_number); + } +} + +void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pci_sriov *iov; + u16 num_vfs; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + iov = pdev->sriov; + num_vfs = pdn->num_vfs; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev, num_vfs); + + if (phb->type == PNV_PHB_IODA2) { + if (pdn->m64_per_iov == 1) + pnv_pci_vf_resource_shift(pdev, -pdn->offset); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev); + + /* Release PE numbers */ + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + } +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe); +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pci_dn *pdn; + int64_t rc; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (!pdev->is_physfn) + return; + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + pe_num = pdn->offset + vf_index; + + pe = &phb->ioda.pe_array[pe_num]; + pe->pe_number = pe_num; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | + pci_iov_virtfn_devfn(pdev, vf_index); + + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", + hose->global_number, pdev->bus->number, + PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), + PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pdev = NULL; + continue; + } + + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), + GFP_KERNEL, hose->node); + pe->tce32_table->data = pe; + + /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } + + if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { + int vf_group; + int vf_per_group; + int vf_index1; + + vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + + for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { + for (vf_index = vf_group * vf_per_group; + vf_index < (vf_group + 1) * vf_per_group && + vf_index < num_vfs; + vf_index++) { + for (vf_index1 = vf_group * vf_per_group; + vf_index1 < (vf_group + 1) * vf_per_group && + vf_index1 < num_vfs; + vf_index1++) { + + rc = opal_pci_set_peltv(phb->opal_id, + pdn->offset + vf_index, + pdn->offset + vf_index1, + OPAL_ADD_PE_TO_DOMAIN); + + if (rc) + dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", + __func__, + pdn->offset + vf_index1, rc); + } + } + } + } +} + +int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int ret; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (phb->type == PNV_PHB_IODA2) { + /* Calculate available PE for required VFs */ + mutex_lock(&phb->ioda.pe_alloc_mutex); + pdn->offset = bitmap_find_next_zero_area( + phb->ioda.pe_alloc, phb->ioda.total_pe, + 0, num_vfs, 0); + if (pdn->offset >= phb->ioda.total_pe) { + mutex_unlock(&phb->ioda.pe_alloc_mutex); + dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); + pdn->offset = 0; + return -EBUSY; + } + bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->num_vfs = num_vfs; + mutex_unlock(&phb->ioda.pe_alloc_mutex); + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev, num_vfs); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* + * When using one M64 BAR to map one IOV BAR, we need to shift + * the IOV BAR according to the PE# allocated to the VFs. + * Otherwise, the PE# for the VF will conflict with others. + */ + if (pdn->m64_per_iov == 1) { + ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); + if (ret) + goto m64_failed; + } + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +m64_failed: + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + + return ret; +} + +int pcibios_sriov_disable(struct pci_dev *pdev) +{ + pnv_pci_sriov_disable(pdev); + + /* Release PCI data */ + remove_dev_pci_data(pdev); + return 0; +} + +int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_dev_pci_data(pdev); + + pnv_pci_sriov_enable(pdev, num_vfs); + return 0; +} +#endif /* CONFIG_PCI_IOV */ + static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); @@ -989,7 +1598,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table); + set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table); } static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, @@ -1016,7 +1625,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, } else { dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(&pdev->dev, &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, &pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->tce32_table); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -1053,9 +1662,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, list_for_each_entry(dev, &bus->devices, bus_list) { if (add_to_iommu_group) set_iommu_table_base_and_group(&dev->dev, - &pe->tce32_table); + pe->tce32_table); else - set_iommu_table_base(&dev->dev, &pe->tce32_table); + set_iommu_table_base(&dev->dev, pe->tce32_table); if (dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate, @@ -1145,8 +1754,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); + struct pnv_ioda_pe *pe = tbl->data; struct pnv_phb *phb = pe->phb; if (phb->type == PNV_PHB_IODA1) @@ -1167,9 +1775,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, int64_t rc; void *addr; - /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) - /* XXX FIXME: Handle 64-bit only DMA devices */ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ @@ -1212,7 +1817,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = &pe->tce32_table; + tbl = pe->tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base << 28, IOMMU_PAGE_SHIFT_4K); @@ -1232,12 +1837,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, TCE_PCI_SWINV_PAIR); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - else + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } return; fail: @@ -1250,8 +1862,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) { - struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe, - tce32_table); + struct pnv_ioda_pe *pe = tbl->data; uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1296,10 +1907,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, pe->tce_bypass_base = 1ull << 59; /* Install set_bypass callback for VFIO */ - pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass; + pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(&pe->tce32_table, true); + pnv_pci_ioda2_set_bypass(pe->tce32_table, true); } static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, @@ -1347,7 +1958,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = &pe->tce32_table; + tbl = pe->tce32_table; pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, IOMMU_PAGE_SHIFT_4K); @@ -1365,12 +1976,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); } iommu_init_table(tbl, phb->hose->node); - iommu_register_group(tbl, phb->hose->global_number, pe->pe_number); - if (pe->pdev) + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - else + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } /* Also create a bypass window */ if (!pnv_iommu_bypass_disabled) @@ -1731,6 +2349,73 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } #endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_PCI_IOV +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct resource *res; + int i; + resource_size_t size; + struct pci_dn *pdn; + int mul, total_vfs; + + if (!pdev->is_physfn || pdev->is_added) + return; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + + pdn = pci_get_pdn(pdev); + pdn->vfs_expanded = 0; + + total_vfs = pci_sriov_get_totalvfs(pdev); + pdn->m64_per_iov = 1; + mul = phb->ioda.total_pe; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_mem_pref_64(res->flags)) { + dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", + i, res); + continue; + } + + size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + + /* bigger than 64M */ + if (size > (1 << 26)) { + dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", + i, res); + pdn->m64_per_iov = M64_PER_IOV; + mul = roundup_pow_of_two(total_vfs); + break; + } + } + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_mem_pref_64(res->flags)) { + dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", + i, res); + continue; + } + + dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); + size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + res->end = res->start + size * mul - 1; + dev_dbg(&pdev->dev, " %pR\n", res); + dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", + i, res, mul); + } + pdn->vfs_expanded = mul; +} +#endif /* CONFIG_PCI_IOV */ + /* * This function is supposed to be called on basis of PE from top * to bottom style. So the the I/O or MMIO segment assigned to @@ -1908,6 +2593,25 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, return phb->ioda.io_segsize; } +#ifdef CONFIG_PCI_IOV +static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, + int resno) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + resource_size_t align, iov_align; + + iov_align = resource_size(&pdev->resource[resno]); + if (iov_align) + return iov_align; + + align = pci_iov_resource_size(pdev, resno); + if (pdn->vfs_expanded) + return pdn->vfs_expanded * align; + + return align; +} +#endif /* CONFIG_PCI_IOV */ + /* Prevent enabling devices for which we couldn't properly * assign a PE */ @@ -1993,6 +2697,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->hub_id = hub_id; phb->opal_id = phb_id; phb->type = ioda_type; + mutex_init(&phb->ioda.pe_alloc_mutex); /* Detect specific models for error handling */ if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) @@ -2052,6 +2757,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); + mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; @@ -2106,6 +2812,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook; pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; + +#ifdef CONFIG_PCI_IOV + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; +#endif + pci_add_flags(PCI_REASSIGN_ALL_RSRC); /* Reset IODA tables to a clean state */ diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index fa96aa8aa1e2..bca2aeb6e4b6 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -666,6 +666,24 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; +#ifdef CONFIG_PCI_IOV + struct pnv_ioda_pe *pe; + struct pci_dn *pdn; + + /* Fix the VF pdn PE number */ + if (pdev->is_virtfn) { + pdn = pci_get_pdn(pdev); + WARN_ON(pdn->pe_number != IODA_INVALID_PE); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (pe->rid == ((pdev->bus->number << 8) | + (pdev->devfn & 0xff))) { + pdn->pe_number = pe->pe_number; + pe->pdev = pdev; + break; + } + } + } +#endif /* CONFIG_PCI_IOV */ if (phb && phb->dma_dev_setup) phb->dma_dev_setup(phb, pdev); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 1f0cb66133a1..070ee888fc95 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -23,6 +23,7 @@ enum pnv_phb_model { #define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ +#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -34,6 +35,9 @@ struct pnv_ioda_pe { * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. */ +#ifdef CONFIG_PCI_IOV + struct pci_dev *parent_dev; +#endif struct pci_dev *pdev; struct pci_bus *pbus; @@ -53,7 +57,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ int tce32_seg; int tce32_segcount; - struct iommu_table tce32_table; + struct iommu_table *tce32_table; phys_addr_t tce_inval_reg_phys; /* 64-bit TCE bypass region */ @@ -145,6 +149,8 @@ struct pnv_phb { /* PE allocation bitmap */ unsigned long *pe_alloc; + /* PE allocation mutex */ + struct mutex pe_alloc_mutex; /* M32 & IO segment maps */ unsigned int *m32_segmap; @@ -159,6 +165,7 @@ struct pnv_phb { * on the sequence of creation */ struct list_head pe_list; + struct mutex pe_list_mutex; /* Reverse map of PEs, will have to extend if * we are to support more than 256 PEs, indexed diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 4b3a4eaad996..ee0ebff103a4 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -19,16 +19,59 @@ #define VIRTFN_ID_LEN 16 -static inline u8 virtfn_bus(struct pci_dev *dev, int id) +int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id) { + if (!dev->is_physfn) + return -EINVAL; return dev->bus->number + ((dev->devfn + dev->sriov->offset + - dev->sriov->stride * id) >> 8); + dev->sriov->stride * vf_id) >> 8); } -static inline u8 virtfn_devfn(struct pci_dev *dev, int id) +int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id) { + if (!dev->is_physfn) + return -EINVAL; return (dev->devfn + dev->sriov->offset + - dev->sriov->stride * id) & 0xff; + dev->sriov->stride * vf_id) & 0xff; +} + +/* + * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may + * change when NumVFs changes. + * + * Update iov->offset and iov->stride when NumVFs is written. + */ +static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn) +{ + struct pci_sriov *iov = dev->sriov; + + pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &iov->offset); + pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &iov->stride); +} + +/* + * The PF consumes one bus number. NumVFs, First VF Offset, and VF Stride + * determine how many additional bus numbers will be consumed by VFs. + * + * Iterate over all valid NumVFs and calculate the maximum number of bus + * numbers that could ever be required. + */ +static inline u8 virtfn_max_buses(struct pci_dev *dev) +{ + struct pci_sriov *iov = dev->sriov; + int nr_virtfn; + u8 max = 0; + int busnr; + + for (nr_virtfn = 1; nr_virtfn <= iov->total_VFs; nr_virtfn++) { + pci_iov_set_numvfs(dev, nr_virtfn); + busnr = pci_iov_virtfn_bus(dev, nr_virtfn - 1); + if (busnr > max) + max = busnr; + } + + return max; } static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr) @@ -57,6 +100,14 @@ static void virtfn_remove_bus(struct pci_bus *physbus, struct pci_bus *virtbus) pci_remove_bus(virtbus); } +resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) +{ + if (!dev->is_physfn) + return 0; + + return dev->sriov->barsz[resno - PCI_IOV_RESOURCES]; +} + static int virtfn_add(struct pci_dev *dev, int id, int reset) { int i; @@ -69,7 +120,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) struct pci_bus *bus; mutex_lock(&iov->dev->sriov->lock); - bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id)); + bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id)); if (!bus) goto failed; @@ -77,7 +128,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) if (!virtfn) goto failed0; - virtfn->devfn = virtfn_devfn(dev, id); + virtfn->devfn = pci_iov_virtfn_devfn(dev, id); virtfn->vendor = dev->vendor; pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device); pci_setup_device(virtfn); @@ -87,13 +138,12 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset) virtfn->multifunction = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = dev->resource + PCI_IOV_RESOURCES + i; + res = &dev->resource[i + PCI_IOV_RESOURCES]; if (!res->parent) continue; virtfn->resource[i].name = pci_name(virtfn); virtfn->resource[i].flags = res->flags; - size = resource_size(res); - do_div(size, iov->total_VFs); + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); virtfn->resource[i].start = res->start + size * id; virtfn->resource[i].end = virtfn->resource[i].start + size - 1; rc = request_resource(res, &virtfn->resource[i]); @@ -140,8 +190,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) struct pci_sriov *iov = dev->sriov; virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus), - virtfn_bus(dev, id), - virtfn_devfn(dev, id)); + pci_iov_virtfn_bus(dev, id), + pci_iov_virtfn_devfn(dev, id)); if (!virtfn) return; @@ -170,6 +220,11 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset) pci_dev_put(dev); } +int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + return 0; +} + static int sriov_enable(struct pci_dev *dev, int nr_virtfn) { int rc; @@ -180,6 +235,8 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) struct pci_dev *pdev; struct pci_sriov *iov = dev->sriov; int bars = 0; + int bus; + int retval; if (!nr_virtfn) return 0; @@ -204,7 +261,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) nres = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { bars |= (1 << (i + PCI_IOV_RESOURCES)); - res = dev->resource + PCI_IOV_RESOURCES + i; + res = &dev->resource[i + PCI_IOV_RESOURCES]; if (res->parent) nres++; } @@ -216,8 +273,10 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) iov->offset = offset; iov->stride = stride; - if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) { - dev_err(&dev->dev, "SR-IOV: bus number out of range\n"); + bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1); + if (bus > dev->bus->busn_res.end) { + dev_err(&dev->dev, "can't enable %d VFs (bus %02x out of range of %pR)\n", + nr_virtfn, bus, &dev->bus->busn_res); return -ENOMEM; } @@ -243,7 +302,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) return rc; } - pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn); + pci_iov_set_numvfs(dev, nr_virtfn); iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE; pci_cfg_access_lock(dev); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); @@ -254,6 +313,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn) if (nr_virtfn < initial) initial = nr_virtfn; + if ((retval = pcibios_sriov_enable(dev, initial))) { + dev_err(&dev->dev, "failure %d from pcibios_sriov_enable()\n", + retval); + return retval; + } + for (i = 0; i < initial; i++) { rc = virtfn_add(dev, i, 0); if (rc) @@ -272,7 +337,7 @@ failed: iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); pci_cfg_access_lock(dev); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); - pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0); + pci_iov_set_numvfs(dev, 0); ssleep(1); pci_cfg_access_unlock(dev); @@ -282,6 +347,11 @@ failed: return rc; } +int __weak pcibios_sriov_disable(struct pci_dev *pdev) +{ + return 0; +} + static void sriov_disable(struct pci_dev *dev) { int i; @@ -293,6 +363,8 @@ static void sriov_disable(struct pci_dev *dev) for (i = 0; i < iov->num_VFs; i++) virtfn_remove(dev, i, 0); + pcibios_sriov_disable(dev); + iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE); pci_cfg_access_lock(dev); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); @@ -303,12 +375,12 @@ static void sriov_disable(struct pci_dev *dev) sysfs_remove_link(&dev->dev.kobj, "dep_link"); iov->num_VFs = 0; - pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, 0); + pci_iov_set_numvfs(dev, 0); } static int sriov_init(struct pci_dev *dev, int pos) { - int i; + int i, bar64; int rc; int nres; u32 pgsz; @@ -357,27 +429,29 @@ found: pgsz &= ~(pgsz - 1); pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz); + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) + return -ENOMEM; + nres = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = dev->resource + PCI_IOV_RESOURCES + i; - i += __pci_read_base(dev, pci_bar_unknown, res, - pos + PCI_SRIOV_BAR + i * 4); + res = &dev->resource[i + PCI_IOV_RESOURCES]; + bar64 = __pci_read_base(dev, pci_bar_unknown, res, + pos + PCI_SRIOV_BAR + i * 4); if (!res->flags) continue; if (resource_size(res) & (PAGE_SIZE - 1)) { rc = -EIO; goto failed; } + iov->barsz[i] = resource_size(res); res->end = res->start + resource_size(res) * total - 1; + dev_info(&dev->dev, "VF(n) BAR%d space: %pR (contains BAR%d for %d VFs)\n", + i, res, i, total); + i += bar64; nres++; } - iov = kzalloc(sizeof(*iov), GFP_KERNEL); - if (!iov) { - rc = -ENOMEM; - goto failed; - } - iov->pos = pos; iov->nres = nres; iov->ctrl = ctrl; @@ -400,15 +474,17 @@ found: dev->sriov = iov; dev->is_physfn = 1; + iov->max_VF_buses = virtfn_max_buses(dev); return 0; failed: for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = dev->resource + PCI_IOV_RESOURCES + i; + res = &dev->resource[i + PCI_IOV_RESOURCES]; res->flags = 0; } + kfree(iov); return rc; } @@ -439,7 +515,7 @@ static void sriov_restore_state(struct pci_dev *dev) pci_update_resource(dev, i); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); - pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, iov->num_VFs); + pci_iov_set_numvfs(dev, iov->num_VFs); pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl); if (iov->ctrl & PCI_SRIOV_CTRL_VFE) msleep(100); @@ -493,6 +569,12 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno) 4 * (resno - PCI_IOV_RESOURCES); } +resource_size_t __weak pcibios_iov_resource_alignment(struct pci_dev *dev, + int resno) +{ + return pci_iov_resource_size(dev, resno); +} + /** * pci_sriov_resource_alignment - get resource alignment for VF BAR * @dev: the PCI device @@ -505,14 +587,7 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno) */ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno) { - struct resource tmp; - int reg = pci_iov_resource_bar(dev, resno); - - if (!reg) - return 0; - - __pci_read_base(dev, pci_bar_unknown, &tmp, reg); - return resource_alignment(&tmp); + return pcibios_iov_resource_alignment(dev, resno); } /** @@ -535,15 +610,13 @@ void pci_restore_iov_state(struct pci_dev *dev) int pci_iov_bus_range(struct pci_bus *bus) { int max = 0; - u8 busnr; struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { if (!dev->is_physfn) continue; - busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1); - if (busnr > max) - max = busnr; + if (dev->sriov->max_VF_buses > max) + max = dev->sriov->max_VF_buses; } return max ? max - bus->number : 0; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4091f82239cd..bae593c04541 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -243,10 +243,12 @@ struct pci_sriov { u16 stride; /* following VF stride */ u32 pgsz; /* page size for BAR alignment */ u8 link; /* Function Dependency Link */ + u8 max_VF_buses; /* max buses consumed by VFs */ u16 driver_max_VFs; /* max num VFs driver supports */ struct pci_dev *dev; /* lowest numbered PF */ struct pci_dev *self; /* this PF */ struct mutex lock; /* lock for VF bus */ + resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ }; #ifdef CONFIG_PCI_ATS diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index e3e17f3c0f0f..6603d401bb7c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -99,8 +99,8 @@ static void remove_from_list(struct list_head *head, } } -static resource_size_t get_res_add_size(struct list_head *head, - struct resource *res) +static struct pci_dev_resource *res_to_dev_res(struct list_head *head, + struct resource *res) { struct pci_dev_resource *dev_res; @@ -109,17 +109,37 @@ static resource_size_t get_res_add_size(struct list_head *head, int idx = res - &dev_res->dev->resource[0]; dev_printk(KERN_DEBUG, &dev_res->dev->dev, - "res[%d]=%pR get_res_add_size add_size %llx\n", + "res[%d]=%pR res_to_dev_res add_size %llx min_align %llx\n", idx, dev_res->res, - (unsigned long long)dev_res->add_size); + (unsigned long long)dev_res->add_size, + (unsigned long long)dev_res->min_align); - return dev_res->add_size; + return dev_res; } } - return 0; + return NULL; } +static resource_size_t get_res_add_size(struct list_head *head, + struct resource *res) +{ + struct pci_dev_resource *dev_res; + + dev_res = res_to_dev_res(head, res); + return dev_res ? dev_res->add_size : 0; +} + +static resource_size_t get_res_add_align(struct list_head *head, + struct resource *res) +{ + struct pci_dev_resource *dev_res; + + dev_res = res_to_dev_res(head, res); + return dev_res ? dev_res->min_align : 0; +} + + /* Sort resources by alignment */ static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) { @@ -215,7 +235,7 @@ static void reassign_resources_sorted(struct list_head *realloc_head, struct resource *res; struct pci_dev_resource *add_res, *tmp; struct pci_dev_resource *dev_res; - resource_size_t add_size; + resource_size_t add_size, align; int idx; list_for_each_entry_safe(add_res, tmp, realloc_head, list) { @@ -238,13 +258,13 @@ static void reassign_resources_sorted(struct list_head *realloc_head, idx = res - &add_res->dev->resource[0]; add_size = add_res->add_size; + align = add_res->min_align; if (!resource_size(res)) { - res->start = add_res->start; + res->start = align; res->end = res->start + add_size - 1; if (pci_assign_resource(add_res->dev, idx)) reset_resource(res); } else { - resource_size_t align = add_res->min_align; res->flags |= add_res->flags & (IORESOURCE_STARTALIGN|IORESOURCE_SIZEALIGN); if (pci_reassign_resource(add_res->dev, idx, @@ -368,8 +388,9 @@ static void __assign_resources_sorted(struct list_head *head, LIST_HEAD(save_head); LIST_HEAD(local_fail_head); struct pci_dev_resource *save_res; - struct pci_dev_resource *dev_res, *tmp_res; + struct pci_dev_resource *dev_res, *tmp_res, *dev_res2; unsigned long fail_type; + resource_size_t add_align, align; /* Check if optional add_size is there */ if (!realloc_head || list_empty(realloc_head)) @@ -384,10 +405,44 @@ static void __assign_resources_sorted(struct list_head *head, } /* Update res in head list with add_size in realloc_head list */ - list_for_each_entry(dev_res, head, list) + list_for_each_entry_safe(dev_res, tmp_res, head, list) { dev_res->res->end += get_res_add_size(realloc_head, dev_res->res); + /* + * There are two kinds of additional resources in the list: + * 1. bridge resource -- IORESOURCE_STARTALIGN + * 2. SR-IOV resource -- IORESOURCE_SIZEALIGN + * Here just fix the additional alignment for bridge + */ + if (!(dev_res->res->flags & IORESOURCE_STARTALIGN)) + continue; + + add_align = get_res_add_align(realloc_head, dev_res->res); + + /* + * The "head" list is sorted by the alignment to make sure + * resources with bigger alignment will be assigned first. + * After we change the alignment of a dev_res in "head" list, + * we need to reorder the list by alignment to make it + * consistent. + */ + if (add_align > dev_res->res->start) { + dev_res->res->start = add_align; + dev_res->res->end = add_align + + resource_size(dev_res->res); + + list_for_each_entry(dev_res2, head, list) { + align = pci_resource_alignment(dev_res2->dev, + dev_res2->res); + if (add_align > align) + list_move_tail(&dev_res->list, + &dev_res2->list); + } + } + + } + /* Try updated head list with add_size added */ assign_requested_resources_sorted(head, &local_fail_head); @@ -962,6 +1017,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, struct resource *b_res = find_free_bus_resource(bus, mask | IORESOURCE_PREFETCH, type); resource_size_t children_add_size = 0; + resource_size_t children_add_align = 0; + resource_size_t add_align = 0; if (!b_res) return -ENOSPC; @@ -986,6 +1043,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, /* put SRIOV requested res to the optional list */ if (realloc_head && i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END) { + add_align = max(pci_resource_alignment(dev, r), add_align); r->end = r->start - 1; add_to_list(realloc_head, dev, r, r_size, 0/* don't care */); children_add_size += r_size; @@ -1016,19 +1074,23 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, if (order > max_order) max_order = order; - if (realloc_head) + if (realloc_head) { children_add_size += get_res_add_size(realloc_head, r); + children_add_align = get_res_add_align(realloc_head, r); + add_align = max(add_align, children_add_align); + } } } min_align = calculate_mem_align(aligns, max_order); min_align = max(min_align, window_alignment(bus, b_res->flags)); size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align); + add_align = max(min_align, add_align); if (children_add_size > add_size) add_size = children_add_size; size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 : calculate_memsize(size, min_size, add_size, - resource_size(b_res), min_align); + resource_size(b_res), add_align); if (!size0 && !size1) { if (b_res->start || b_res->end) dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n", @@ -1040,10 +1102,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, b_res->end = size0 + min_align - 1; b_res->flags |= IORESOURCE_STARTALIGN; if (size1 > size0 && realloc_head) { - add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align); - dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n", + add_to_list(realloc_head, bus->self, b_res, size1-size0, add_align); + dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx add_align %llx\n", b_res, &bus->busn_res, - (unsigned long long)size1-size0); + (unsigned long long) (size1 - size0), + (unsigned long long) add_align); } return 0; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 211e9da8a7d7..4e1f17db1a81 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1174,6 +1174,7 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus); void pci_setup_bridge(struct pci_bus *bus); resource_size_t pcibios_window_alignment(struct pci_bus *bus, unsigned long type); +resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); #define PCI_VGA_STATE_CHANGE_BRIDGE (1 << 0) #define PCI_VGA_STATE_CHANGE_DECODES (1 << 1) @@ -1669,13 +1670,25 @@ int pci_ext_cfg_avail(void); void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar); #ifdef CONFIG_PCI_IOV +int pci_iov_virtfn_bus(struct pci_dev *dev, int id); +int pci_iov_virtfn_devfn(struct pci_dev *dev, int id); + int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn); void pci_disable_sriov(struct pci_dev *dev); int pci_num_vf(struct pci_dev *dev); int pci_vfs_assigned(struct pci_dev *dev); int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs); int pci_sriov_get_totalvfs(struct pci_dev *dev); +resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno); #else +static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id) +{ + return -ENOSYS; +} +static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id) +{ + return -ENOSYS; +} static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn) { return -ENODEV; } static inline void pci_disable_sriov(struct pci_dev *dev) { } @@ -1686,6 +1699,8 @@ static inline int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs) { return 0; } static inline int pci_sriov_get_totalvfs(struct pci_dev *dev) { return 0; } +static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno) +{ return 0; } #endif #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE) |