From ef0dab4aae14e25efddf1577736f8450132800c5 Mon Sep 17 00:00:00 2001 From: David Miller Date: Sat, 18 Jun 2016 23:52:25 -0700 Subject: PCI: Fix unaligned accesses in VC code The save/restore buffers for VC state is first composed of a 2-byte control register, then a bunch of 4-byte words. This causes unaligned accesses which trap on platform such as sparc. This is easy to fix by simply moving the buffer pointer forward by 4 bytes instead of 2 after dealing with the control register. The length adjustment needs to be changed likewise as well. Fixes: 5f8fc43217a0 ("PCI: Include pci/pcie/Kconfig directly from pci/Kconfig") Reported-by: Meelis Roos Reported-by: Anatoly Pugachev Signed-off-by: David S. Miller Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v4.6+ --- drivers/pci/vc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index dfbab61a1b47..1fa3a3219c45 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -221,9 +221,9 @@ static int pci_vc_do_save_buffer(struct pci_dev *dev, int pos, else pci_write_config_word(dev, pos + PCI_VC_PORT_CTRL, *(u16 *)buf); - buf += 2; + buf += 4; } - len += 2; + len += 4; /* * If we have any Low Priority VCs and a VC Arbitration Table Offset -- cgit v1.2.3 From 5eb6d660193ccc471b415d6f31e17312a63de167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:14 +0900 Subject: PCI: Add pci_msix_desc_addr() helper Add a pci_msix_desc_addr() helper to factor out the calculation of the base address for a given MSI-X vector. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- drivers/pci/msi.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a080f4496fe2..0d94fbf95ba6 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -207,6 +207,12 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); } +static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) +{ + return desc->mask_base + + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; +} + /* * This internal function does not flush PCI writes to the device. * All users must ensure that they read from the device before either @@ -217,8 +223,6 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { u32 mask_bits = desc->masked; - unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL; if (pci_msi_ignore_mask) return 0; @@ -226,7 +230,7 @@ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; if (flag) mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; - writel(mask_bits, desc->mask_base + offset); + writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL); return mask_bits; } @@ -284,8 +288,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) BUG_ON(dev->current_state != PCI_D0); if (entry->msi_attrib.is_msix) { - void __iomem *base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + void __iomem *base = pci_msix_desc_addr(entry); msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); @@ -315,9 +318,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (dev->current_state != PCI_D0) { /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { - void __iomem *base; - base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + void __iomem *base = pci_msix_desc_addr(entry); writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); -- cgit v1.2.3 From 12eb21de1f26233ad195ffe991697d61790f4193 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:15 +0900 Subject: PCI: Switch msix_program_entries() to use pci_msix_desc_addr() Instead of relying on the msix_entry structure for the vector number, read it from the msi_desc. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- drivers/pci/msi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 0d94fbf95ba6..a385f39be6f7 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -713,11 +713,9 @@ static void msix_program_entries(struct pci_dev *dev, int i = 0; for_each_pci_msi_entry(entry, dev) { - int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL; - entries[i].vector = entry->irq; - entry->masked = readl(entry->mask_base + offset); + entry->masked = readl(pci_msix_desc_addr(entry) + + PCI_MSIX_ENTRY_VECTOR_CTRL); msix_mask_irq(entry, 1); i++; } -- cgit v1.2.3 From 3ac020e0ca8beff9b695f9866a06d50c09c602d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:16 +0900 Subject: PCI: Make the "entries" argument to pci_enable_msix() optional The "entries" argument isn't needed if the list of entries does not contain any holes. Make it optional so that we can avoid the need to allocate a msix_entry structure for this (common) case. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- drivers/pci/msi.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index a385f39be6f7..98ace67c5f4d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -695,7 +695,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; - entry->msi_attrib.entry_nr = entries[i].entry; + if (entries) + entry->msi_attrib.entry_nr = entries[i].entry; + else + entry->msi_attrib.entry_nr = i; entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; entry->nvec_used = 1; @@ -713,11 +716,11 @@ static void msix_program_entries(struct pci_dev *dev, int i = 0; for_each_pci_msi_entry(entry, dev) { - entries[i].vector = entry->irq; + if (entries) + entries[i++].vector = entry->irq; entry->masked = readl(pci_msix_desc_addr(entry) + PCI_MSIX_ENTRY_VECTOR_CTRL); msix_mask_irq(entry, 1); - i++; } } @@ -930,7 +933,7 @@ EXPORT_SYMBOL(pci_msix_vec_count); /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of MSI-X entries + * @entries: pointer to an array of MSI-X entries (optional) * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number @@ -950,22 +953,21 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) if (!pci_msi_supported(dev, nvec)) return -EINVAL; - if (!entries) - return -EINVAL; - nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; if (nvec > nr_entries) return nr_entries; - /* Check for any invalid entries */ - for (i = 0; i < nvec; i++) { - if (entries[i].entry >= nr_entries) - return -EINVAL; /* invalid entry */ - for (j = i + 1; j < nvec; j++) { - if (entries[i].entry == entries[j].entry) - return -EINVAL; /* duplicate entry */ + if (entries) { + /* Check for any invalid entries */ + for (i = 0; i < nvec; i++) { + if (entries[i].entry >= nr_entries) + return -EINVAL; /* invalid entry */ + for (j = i + 1; j < nvec; j++) { + if (entries[i].entry == entries[j].entry) + return -EINVAL; /* duplicate entry */ + } } } WARN_ON(!!dev->msix_enabled); -- cgit v1.2.3 From aff171641d181ea573380efc3f559c9de4741fc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:17 +0900 Subject: PCI: Provide sensible IRQ vector alloc/free routines Add a function to allocate and free a range of interrupt vectors, using MSI-X, MSI or legacy vectors (in that order) based on the capabilities of the underlying device and PCIe complex. Additionally a new helper is provided to get the Linux IRQ number for given device-relative vector so that the drivers don't need to allocate their own arrays to keep track of the vectors for the multi vector MSI-X case. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 467 +++++++--------------------------------- drivers/pci/msi.c | 89 ++++++++ include/linux/pci.h | 27 +++ 3 files changed, 192 insertions(+), 391 deletions(-) (limited to 'drivers/pci') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1179850f453c..0ac612b8c3fb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -78,422 +78,107 @@ CONFIG_PCI_MSI option. 4.2 Using MSI -Most of the hard work is done for the driver in the PCI layer. It simply -has to request that the PCI layer set up the MSI capability for this +Most of the hard work is done for the driver in the PCI layer. The driver +simply has to request that the PCI layer set up the MSI capability for this device. -4.2.1 pci_enable_msi +To automatically use MSI or MSI-X interrupt vectors, use the following +function: -int pci_enable_msi(struct pci_dev *dev) + int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); -A successful call allocates ONE interrupt to the device, regardless -of how many MSIs the device supports. The device is switched from -pin-based interrupt mode to MSI mode. The dev->irq number is changed -to a new number which represents the message signaled interrupt; -consequently, this function should be called before the driver calls -request_irq(), because an MSI is delivered via a vector that is -different from the vector of a pin-based interrupt. +which allocates up to max_vecs interrupt vectors for a PCI device. It +returns the number of vectors allocated or a negative error. If the device +has a requirements for a minimum number of vectors the driver can pass a +min_vecs argument set to this limit, and the PCI core will return -ENOSPC +if it can't meet the minimum number of vectors. -4.2.2 pci_enable_msi_range +The flags argument should normally be set to 0, but can be used to pass the +PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support +MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in +case the device does not support legacy interrupt lines. -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +To get the Linux IRQ numbers passed to request_irq() and free_irq() and the +vectors, use the following function: -This function allows a device driver to request any number of MSI -interrupts within specified range from 'minvec' to 'maxvec'. + int pci_irq_vector(struct pci_dev *dev, unsigned int nr); -If this function returns a positive number it indicates the number of -MSI interrupts that have been successfully allocated. In this case -the device is switched from pin-based interrupt mode to MSI mode and -updates dev->irq to be the lowest of the new interrupts assigned to it. -The other interrupts assigned to the device are in the range dev->irq -to dev->irq + returned value - 1. Device driver can use the returned -number of successfully allocated MSI interrupts to further allocate -and initialize device resources. +Any allocated resources should be freed before removing the device using +the following function: -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. + void pci_free_irq_vectors(struct pci_dev *dev); -This function should be called before the driver calls request_irq(), -because MSI interrupts are delivered via vectors that are different -from the vector of a pin-based interrupt. +If a device supports both MSI-X and MSI capabilities, this API will use the +MSI-X facilities in preference to the MSI facilities. MSI-X supports any +number of interrupts between 1 and 2048. In contrast, MSI is restricted to +a maximum of 32 interrupts (and must be a power of two). In addition, the +MSI interrupt vectors must be allocated consecutively, so the system might +not be able to allocate as many vectors for MSI as it could for MSI-X. On +some platforms, MSI interrupts must all be targeted at the same set of CPUs +whereas MSI-X interrupts can all be targeted at different CPUs. -It is ideal if drivers can cope with a variable number of MSI interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. +If a device supports neither MSI-X or MSI it will fall back to a single +legacy IRQ vector. -There could be devices that can not operate with just any number of MSI -interrupts within a range. See chapter 4.3.1.3 to get the idea how to -handle such devices for MSI-X - the same logic applies to MSI. +The typical usage of MSI or MSI-X interrupts is to allocate as many vectors +as possible, likely up to the limit supported by the device. If nvec is +larger than the number supported by the device it will automatically be +capped to the supported limit, so there is no need to query the number of +vectors supported beforehand: -4.2.1.1 Maximum possible number of MSI interrupts - -The typical usage of MSI interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msi_vec_count() function: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.2.1.2 Exact number of MSI interrupts + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0); + if (nvec < 0) + goto out_err; If a driver is unable or unwilling to deal with a variable number of MSI -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, nvec, nvec); -} - -Note, unlike pci_enable_msi_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msi_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msi_exact() does). - -4.2.1.3 Single MSI mode - -The most notorious example of the request type described above is -enabling the single MSI mode for a device. It could be done by passing -two 1s as 'minvec' and 'maxvec': - -static int foo_driver_enable_single_msi(struct pci_dev *pdev) -{ - return pci_enable_msi_range(pdev, 1, 1); -} - -Note, unlike pci_enable_msi() function, which could be also used to -enable the single MSI mode, pci_enable_msi_range() returns either a -negative errno or 1 (not negative errno or 0 - as pci_enable_msi() -does). - -4.2.3 pci_enable_msi_exact - -int pci_enable_msi_exact(struct pci_dev *dev, int nvec) - -This variation on pci_enable_msi_range() call allows a device driver to -request exactly 'nvec' MSIs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. - -By contrast with pci_enable_msi_range() function, pci_enable_msi_exact() -returns zero in case of success, which indicates MSI interrupts have been -successfully allocated. - -4.2.4 pci_disable_msi - -void pci_disable_msi(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msi_range(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated MSIs. The interrupts may subsequently be assigned -to another device, so drivers should not cache the value of dev->irq. - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI enabled and thus leaking its vector. - -4.2.4 pci_msi_vec_count - -int pci_msi_vec_count(struct pci_dev *dev) - -This function could be used to retrieve the number of MSI vectors the -device requested (via the Multiple Message Capable register). The MSI -specification only allows the returned value to be a power of two, -up to a maximum of 2^5 (32). - -If this function returns a negative number, it indicates the device is -not capable of sending MSIs. - -If this function returns a positive number, it indicates the maximum -number of MSI interrupt vectors that could be allocated. - -4.3 Using MSI-X - -The MSI-X capability is much more flexible than the MSI capability. -It supports up to 2048 interrupts, each of which can be controlled -independently. To support this flexibility, drivers must use an array of -`struct msix_entry': - -struct msix_entry { - u16 vector; /* kernel uses to write alloc vector */ - u16 entry; /* driver uses to specify entry */ -}; - -This allows for the device to use these interrupts in a sparse fashion; -for example, it could use interrupts 3 and 1027 and yet allocate only a -two-element array. The driver is expected to fill in the 'entry' value -in each element of the array to indicate for which entries the kernel -should assign interrupts; it is invalid to fill in two entries with the -same number. - -4.3.1 pci_enable_msix_range - -int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) - -Calling this function asks the PCI subsystem to allocate any number of -MSI-X interrupts within specified range from 'minvec' to 'maxvec'. -The 'entries' argument is a pointer to an array of msix_entry structs -which should be at least 'maxvec' entries in size. - -On success, the device is switched into MSI-X mode and the function -returns the number of MSI-X interrupts that have been successfully -allocated. In this case the 'vector' member in entries numbered from -0 to the returned value - 1 is populated with the interrupt number; -the driver should then call request_irq() for each 'vector' that it -decides to use. The device driver is responsible for keeping track of the -interrupts assigned to the MSI-X vectors so it can free them again later. -Device driver can use the returned number of successfully allocated MSI-X -interrupts to further allocate and initialize device resources. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -This function, in contrast with pci_enable_msi_range(), does not adjust -dev->irq. The device will not generate interrupts for this interrupt -number once MSI-X is enabled. - -Device drivers should normally call this function once per device -during the initialization phase. - -It is ideal if drivers can cope with a variable number of MSI-X interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. - -There could be devices that can not operate with just any number of MSI-X -interrupts within a range. E.g., an network adapter might need let's say -four vectors per each queue it provides. Therefore, a number of MSI-X -interrupts allocated should be a multiple of four. In this case interface -pci_enable_msix_range() can not be used alone to request MSI-X interrupts -(since it can allocate any number within the range, without any notion of -the multiple of four) and the device driver should master a custom logic -to request the required number of MSI-X interrupts. - -4.3.1.1 Maximum possible number of MSI-X interrupts - -The typical usage of MSI-X interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msix_vec_count() function: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI-X interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.3.1.2 Exact number of MSI-X interrupts - -If a driver is unable or unwilling to deal with a variable number of MSI-X -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - nvec, nvec); -} - -Note, unlike pci_enable_msix_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msix_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msix_exact() does). - -4.3.1.3 Specific requirements to the number of MSI-X interrupts - -As noted above, there could be devices that can not operate with just any -number of MSI-X interrupts within a range. E.g., let's assume a device that -is only capable sending the number of MSI-X interrupts which is a power of -two. A routine that enables MSI-X mode for such device might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - maxvec, maxvec); - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } - - return rc; -} - -Note how pci_enable_msix_range() return value is analyzed for a fallback - -any error code other than -ENOSPC indicates a fatal error and should not -be retried. - -4.3.2 pci_enable_msix_exact - -int pci_enable_msix_exact(struct pci_dev *dev, - struct msix_entry *entries, int nvec) - -This variation on pci_enable_msix_range() call allows a device driver to -request exactly 'nvec' MSI-Xs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -By contrast with pci_enable_msix_range() function, pci_enable_msix_exact() -returns zero in case of success, which indicates MSI-X interrupts have been -successfully allocated. - -Another version of a routine that enables MSI-X mode for a device with -specific requirements described in chapter 4.3.1.3 might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_exact(adapter->pdev, - adapter->msix_entries, maxvec); - - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } else if (rc < 0) { - return rc; - } - - return maxvec; -} - -4.3.3 pci_disable_msix - -void pci_disable_msix(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msix_range(). -It frees the previously allocated MSI-X interrupts. The interrupts may -subsequently be assigned to another device, so drivers should not cache -the value of the 'vector' elements over a call to pci_disable_msix(). - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI-X enabled and thus leaking its vector. - -4.3.3 The MSI-X Table - -The MSI-X capability specifies a BAR and offset within that BAR for the -MSI-X Table. This address is mapped by the PCI subsystem, and should not -be accessed directly by the device driver. If the driver wishes to -mask or unmask an interrupt, it should call disable_irq() / enable_irq(). +interrupts it can request a particular number of interrupts by passing that +number to pci_alloc_irq_vectors() function as both 'min_vecs' and +'max_vecs' parameters: -4.3.4 pci_msix_vec_count + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0); + if (ret < 0) + goto out_err; -int pci_msix_vec_count(struct pci_dev *dev) +The most notorious example of the request type described above is enabling +the single MSI mode for a device. It could be done by passing two 1s as +'min_vecs' and 'max_vecs': -This function could be used to retrieve number of entries in the device -MSI-X table. + ret = pci_alloc_irq_vectors(pdev, 1, 1, 0); + if (ret < 0) + goto out_err; -If this function returns a negative number, it indicates the device is -not capable of sending MSI-Xs. +Some devices might not support using legacy line interrupts, in which case +the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform +can't provide MSI or MSI-X interrupts: -If this function returns a positive number, it indicates the maximum -number of MSI-X interrupt vectors that could be allocated. + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY); + if (nvec < 0) + goto out_err; -4.4 Handling devices implementing both MSI and MSI-X capabilities +4.3 Legacy APIs -If a device implements both MSI and MSI-X capabilities, it can -run in either MSI mode or MSI-X mode, but not both simultaneously. -This is a requirement of the PCI spec, and it is enforced by the -PCI layer. Calling pci_enable_msi_range() when MSI-X is already -enabled or pci_enable_msix_range() when MSI is already enabled -results in an error. If a device driver wishes to switch between MSI -and MSI-X at runtime, it must first quiesce the device, then switch -it back to pin-interrupt mode, before calling pci_enable_msi_range() -or pci_enable_msix_range() and resuming operation. This is not expected -to be a common operation but may be useful for debugging or testing -during development. +The following old APIs to enable and disable MSI or MSI-X interrupts should +not be used in new code: -4.5 Considerations when using MSIs + pci_enable_msi() /* deprecated */ + pci_enable_msi_range() /* deprecated */ + pci_enable_msi_exact() /* deprecated */ + pci_disable_msi() /* deprecated */ + pci_enable_msix_range() /* deprecated */ + pci_enable_msix_exact() /* deprecated */ + pci_disable_msix() /* deprecated */ -4.5.1 Choosing between MSI-X and MSI +Additionally there are APIs to provide the number of supported MSI or MSI-X +vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these +should be avoided in favor of letting pci_alloc_irq_vectors() cap the +number of vectors. If you have a legitimate special use case for the count +of vectors we might have to revisit that decision and add a +pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. -If your device supports both MSI-X and MSI capabilities, you should use -the MSI-X facilities in preference to the MSI facilities. As mentioned -above, MSI-X supports any number of interrupts between 1 and 2048. -In contrast, MSI is restricted to a maximum of 32 interrupts (and -must be a power of two). In addition, the MSI interrupt vectors must -be allocated consecutively, so the system might not be able to allocate -as many vectors for MSI as it could for MSI-X. On some platforms, MSI -interrupts must all be targeted at the same set of CPUs whereas MSI-X -interrupts can all be targeted at different CPUs. +4.4 Considerations when using MSIs -4.5.2 Spinlocks +4.4.1 Spinlocks Most device drivers have a per-device spinlock which is taken in the interrupt handler. With pin-based interrupts or a single MSI, it is not @@ -505,7 +190,7 @@ acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts and acquire the lock (see Documentation/DocBook/kernel-locking). -4.6 How to tell whether MSI/MSI-X is enabled on a device +4.5 How to tell whether MSI/MSI-X is enabled on a device Using 'lspci -v' (as root) may show some devices with "MSI", "Message Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98ace67c5f4d..5e5ab478ea7d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -4,6 +4,7 @@ * * Copyright (C) 2003-2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + * Copyright (C) 2016 Christoph Hellwig. */ #include @@ -1121,6 +1122,94 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_alloc_irq_vectors - allocate multiple IRQs for a device + * @dev: PCI device to operate on + * @min_vecs: minimum number of vectors required (must be >= 1) + * @max_vecs: maximum (desired) number of vectors + * @flags: flags or quirks for the allocation + * + * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI + * vectors if available, and fall back to a single legacy vector + * if neither is available. Return the number of vectors allocated, + * (which might be smaller than @max_vecs) if successful, or a negative + * error code on error. If less than @min_vecs interrupt vectors are + * available for @dev the function will fail with -ENOSPC. + * + * To get the Linux IRQ number used for a vector that can be passed to + * request_irq() use the pci_irq_vector() helper. + */ +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + int vecs = -ENOSPC; + + if (!(flags & PCI_IRQ_NOMSIX)) { + vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + if (!(flags & PCI_IRQ_NOMSI)) { + vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + /* use legacy irq if allowed */ + if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1) + return 1; + return vecs; +} +EXPORT_SYMBOL(pci_alloc_irq_vectors); + +/** + * pci_free_irq_vectors - free previously allocated IRQs for a device + * @dev: PCI device to operate on + * + * Undoes the allocations and enabling in pci_alloc_irq_vectors(). + */ +void pci_free_irq_vectors(struct pci_dev *dev) +{ + pci_disable_msix(dev); + pci_disable_msi(dev); +} +EXPORT_SYMBOL(pci_free_irq_vectors); + +/** + * pci_irq_vector - return Linux IRQ number of a device vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + */ +int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->irq; + i++; + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(nr >= entry->nvec_used)) + return -EINVAL; + } else { + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + } + + return dev->irq + nr; +} +EXPORT_SYMBOL(pci_irq_vector); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..52ecd49e8049 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1237,6 +1237,10 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); +#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ +#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ +#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ + /* kmem_cache style wrapper around pci_alloc_consistent() */ #include @@ -1284,6 +1288,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, return rc; return 0; } +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pci_free_irq_vectors(struct pci_dev *dev); +int pci_irq_vector(struct pci_dev *dev, unsigned int nr); + #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_msi_shutdown(struct pci_dev *dev) { } @@ -1307,6 +1316,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev, static inline int pci_enable_msix_exact(struct pci_dev *dev, struct msix_entry *entries, int nvec) { return -ENOSYS; } +static inline int pci_alloc_irq_vectors(struct pci_dev *dev, + unsigned int min_vecs, unsigned int max_vecs, + unsigned int flags) +{ + if (min_vecs > 1) + return -EINVAL; + return 1; +} +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + return dev->irq; +} #endif #ifdef CONFIG_PCIEPORTBUS -- cgit v1.2.3 From 4ef33685aa0957d771e068b60a5f3ca6b47ade1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:18 +0900 Subject: PCI: Spread interrupt vectors in pci_alloc_irq_vectors() Set the affinity_mask in the PCI device before allocating vectors so that the affinity can be propagated through the MSI descriptor structures to the core IRQ code. To facilitate this, new __pci_enable_msi_range() and __pci_enable_msix_range() helpers are factored out of their not prefixed variants which assigning the new IRQ affinity mask in the PCI device so that the low-level interrupt code can perform the interrupt affinity assignment and do node-local allocations. A new PCI_IRQ_NOAFFINITY flag is added to pci_alloc_irq_vectors() so that this function can also be used by drivers that don't wish to use the automatic affinity assignment. [bhelgaas: omit "else" after "return" consistently] Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 4 ++ drivers/pci/msi.c | 134 ++++++++++++++++++++++++++-------------- include/linux/pci.h | 2 + 3 files changed, 95 insertions(+), 45 deletions(-) (limited to 'drivers/pci') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 0ac612b8c3fb..c55df2911136 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -99,6 +99,10 @@ PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in case the device does not support legacy interrupt lines. +By default this function will spread the interrupts around the available +CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY +flag. + To get the Linux IRQ numbers passed to request_irq() and free_irq() and the vectors, use the following function: diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5e5ab478ea7d..a02981efdad5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -569,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); entry->nvec_used = nvec; + entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; @@ -680,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec) { + const struct cpumask *mask = NULL; struct msi_desc *entry; - int i; + int cpu = -1, i; for (i = 0; i < nvec; i++) { + if (dev->irq_affinity) { + cpu = cpumask_next(cpu, dev->irq_affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(dev->irq_affinity); + mask = cpumask_of(cpu); + } + entry = alloc_msi_entry(&dev->dev); if (!entry) { if (!i) @@ -703,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; entry->nvec_used = 1; + entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); } @@ -1028,19 +1038,8 @@ int pci_msi_enabled(void) } EXPORT_SYMBOL(pci_msi_enabled); -/** - * pci_enable_msi_range - configure device's MSI capability structure - * @dev: device to configure - * @minvec: minimal number of interrupts to configure - * @maxvec: maximum number of interrupts to configure - * - * This function tries to allocate a maximum possible number of interrupts in a - * range between @minvec and @maxvec. It returns a negative errno if an error - * occurs. If it succeeds, it returns the actual number of interrupts allocated - * and updates the @dev's irq member to the lowest new interrupt number; - * the other interrupt numbers allocated to this device are consecutive. - **/ -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, + unsigned int flags) { int nvec; int rc; @@ -1063,25 +1062,85 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; - else if (nvec < minvec) + if (nvec < minvec) return -EINVAL; - else if (nvec > maxvec) + + if (nvec > maxvec) nvec = maxvec; - do { + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) + return -ENOSPC; + } + rc = msi_capability_init(dev, nvec); - if (rc < 0) { + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) return rc; - } else if (rc > 0) { - if (rc < minvec) + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } +} + +/** + * pci_enable_msi_range - configure device's MSI capability structure + * @dev: device to configure + * @minvec: minimal number of interrupts to configure + * @maxvec: maximum number of interrupts to configure + * + * This function tries to allocate a maximum possible number of interrupts in a + * range between @minvec and @maxvec. It returns a negative errno if an error + * occurs. If it succeeds, it returns the actual number of interrupts allocated + * and updates the @dev's irq member to the lowest new interrupt number; + * the other interrupt numbers allocated to this device are consecutive. + **/ +int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +{ + return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY); +} +EXPORT_SYMBOL(pci_enable_msi_range); + +static int __pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, int minvec, int maxvec, + unsigned int flags) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) return -ENOSPC; - nvec = rc; } - } while (rc); - return nvec; + rc = pci_enable_msix(dev, entries, nvec); + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) + return rc; + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } } -EXPORT_SYMBOL(pci_enable_msi_range); /** * pci_enable_msix_range - configure device's MSI-X capability structure @@ -1099,26 +1158,10 @@ EXPORT_SYMBOL(pci_enable_msi_range); * with new allocated MSI-X interrupts. **/ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) + int minvec, int maxvec) { - int nvec = maxvec; - int rc; - - if (maxvec < minvec) - return -ERANGE; - - do { - rc = pci_enable_msix(dev, entries, nvec); - if (rc < 0) { - return rc; - } else if (rc > 0) { - if (rc < minvec) - return -ENOSPC; - nvec = rc; - } - } while (rc); - - return nvec; + return __pci_enable_msix_range(dev, entries, minvec, maxvec, + PCI_IRQ_NOAFFINITY); } EXPORT_SYMBOL(pci_enable_msix_range); @@ -1145,13 +1188,14 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, int vecs = -ENOSPC; if (!(flags & PCI_IRQ_NOMSIX)) { - vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs, + flags); if (vecs > 0) return vecs; } if (!(flags & PCI_IRQ_NOMSI)) { - vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags); if (vecs > 0) return vecs; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 52ecd49e8049..f1406619f868 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -320,6 +320,7 @@ struct pci_dev { * directly, use the values stored here. They might be different! */ unsigned int irq; + struct cpumask *irq_affinity; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ bool match_driver; /* Skip attaching driver */ @@ -1240,6 +1241,7 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ #define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ #define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ +#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */ /* kmem_cache style wrapper around pci_alloc_consistent() */ -- cgit v1.2.3