From 950334bcf17a6ab55ce13d3bdf050f7b429320d5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 28 May 2016 18:09:16 -0500 Subject: PCI: Add devm_request_pci_bus_resources() Several host bridge drivers iterate through the list of bridge windows to request resources. Several others don't request the window resources at all. Add a devm_request_pci_bus_resources() interface to make it easier for drivers to request all the window resources. Export to GPL modules (from Arnd Bergmann ). Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/pci.h') diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..6ac83606cb0d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1143,9 +1143,12 @@ void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); void pci_free_resource_list(struct list_head *resources); -void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags); +void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, + unsigned int flags); struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n); void pci_bus_remove_resources(struct pci_bus *bus); +int devm_request_pci_bus_resources(struct device *dev, + struct list_head *resources); #define pci_bus_for_each_resource(bus, res, i) \ for (i = 0; \ -- cgit v1.2.3 From 4d3f13845957a87729a324cce8509fad8826ef52 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Fri, 10 Jun 2016 21:55:11 +0200 Subject: PCI: Add pci_unmap_iospace() to unmap I/O resources Add pci_unmap_iospace() to undo what pci_remap_iospace() did. This is needed to support hotplug removal of host bridges that use pci_remap_iospace(). [bhelgaas: changelog] Signed-off-by: Sinan Kaya Signed-off-by: Tomasz Nowicki Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi --- drivers/pci/pci.c | 18 ++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/linux/pci.h') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..eb431b5c3685 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include "pci.h" @@ -3165,6 +3166,23 @@ int __weak pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) #endif } +/** + * pci_unmap_iospace - Unmap the memory mapped I/O space + * @res: resource to be unmapped + * + * Unmap the CPU virtual address @res from virtual address space. + * Only architectures that have memory mapped IO functions defined + * (and the PCI_IOBASE value defined) should call this function. + */ +void pci_unmap_iospace(struct resource *res) +{ +#if defined(PCI_IOBASE) && defined(CONFIG_MMU) + unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; + + unmap_kernel_range(vaddr, resource_size(res)); +#endif +} + static void __pci_set_master(struct pci_dev *dev, bool enable) { u16 old_cmd, cmd; diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..12349deb688c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1167,6 +1167,7 @@ int pci_register_io_range(phys_addr_t addr, resource_size_t size); unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +void pci_unmap_iospace(struct resource *res); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { -- cgit v1.2.3 From 935c760ec8101413248da23b6df45f0a7a643c62 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 21:55:13 +0200 Subject: PCI/ACPI: Add generic MCFG table handling On ACPI systems that support memory-mapped config space access, i.e., ECAM, the PCI Firmware Specification says the OS can learn where the ECAM space is from either: - the static MCFG table (for non-hotpluggable bridges), or - the _CBA method (for hotpluggable bridges) The current MCFG table handling code cannot be easily generalized owing to x86-specific quirks, which makes it hard to reuse on other architectures. Implement generic MCFG handling from scratch, including: - Simple MCFG table parsing (via pci_mmcfg_late_init() as in current x86) - MCFG region lookup for a (domain, bus_start, bus_end) tuple [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Jayachandran C Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi --- drivers/acpi/Kconfig | 3 ++ drivers/acpi/Makefile | 1 + drivers/acpi/pci_mcfg.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci-acpi.h | 2 ++ include/linux/pci.h | 2 +- 5 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/pci_mcfg.c (limited to 'include/linux/pci.h') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b7e2e776397d..f98c3287256e 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -217,6 +217,9 @@ config ACPI_PROCESSOR_IDLE bool select CPU_IDLE +config ACPI_MCFG + bool + config ACPI_CPPC_LIB bool depends on ACPI_PROCESSOR diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 251ce85a66fb..632e81feef69 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o acpi-y += ec.o acpi-$(CONFIG_ACPI_DOCK) += dock.o acpi-y += pci_root.o pci_link.o pci_irq.o +obj-$(CONFIG_ACPI_MCFG) += pci_mcfg.o acpi-y += acpi_lpss.o acpi_apd.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c new file mode 100644 index 000000000000..b5b376e081f5 --- /dev/null +++ b/drivers/acpi/pci_mcfg.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2016 Broadcom + * Author: Jayachandran C + * Copyright (C) 2016 Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation (the "GPL"). + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 (GPLv2) for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 (GPLv2) along with this source code. + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include +#include +#include + +/* Structure to hold entries from the MCFG table */ +struct mcfg_entry { + struct list_head list; + phys_addr_t addr; + u16 segment; + u8 bus_start; + u8 bus_end; +}; + +/* List to save MCFG entries */ +static LIST_HEAD(pci_mcfg_list); + +phys_addr_t pci_mcfg_lookup(u16 seg, struct resource *bus_res) +{ + struct mcfg_entry *e; + + /* + * We expect exact match, unless MCFG entry end bus covers more than + * specified by caller. + */ + list_for_each_entry(e, &pci_mcfg_list, list) { + if (e->segment == seg && e->bus_start == bus_res->start && + e->bus_end >= bus_res->end) + return e->addr; + } + + return 0; +} + +static __init int pci_mcfg_parse(struct acpi_table_header *header) +{ + struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *mptr; + struct mcfg_entry *e, *arr; + int i, n; + + if (header->length < sizeof(struct acpi_table_mcfg)) + return -EINVAL; + + n = (header->length - sizeof(struct acpi_table_mcfg)) / + sizeof(struct acpi_mcfg_allocation); + mcfg = (struct acpi_table_mcfg *)header; + mptr = (struct acpi_mcfg_allocation *) &mcfg[1]; + + arr = kcalloc(n, sizeof(*arr), GFP_KERNEL); + if (!arr) + return -ENOMEM; + + for (i = 0, e = arr; i < n; i++, mptr++, e++) { + e->segment = mptr->pci_segment; + e->addr = mptr->address; + e->bus_start = mptr->start_bus_number; + e->bus_end = mptr->end_bus_number; + list_add(&e->list, &pci_mcfg_list); + } + + pr_info("MCFG table detected, %d entries\n", n); + return 0; +} + +/* Interface called by ACPI - parse and save MCFG table */ +void __init pci_mmcfg_late_init(void) +{ + int err = acpi_table_parse(ACPI_SIG_MCFG, pci_mcfg_parse); + if (err) + pr_err("Failed to parse MCFG (%d)\n", err); +} diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 89ab0572dbc6..7d63a66e8ed4 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -24,6 +24,8 @@ static inline acpi_status pci_acpi_remove_pm_notifier(struct acpi_device *dev) } extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle); +extern phys_addr_t pci_mcfg_lookup(u16 domain, struct resource *bus_res); + static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { struct pci_bus *pbus = pdev->bus; diff --git a/include/linux/pci.h b/include/linux/pci.h index 12349deb688c..ce03d650279b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1723,7 +1723,7 @@ void pcibios_free_irq(struct pci_dev *dev); extern struct dev_pm_ops pcibios_pm_ops; #endif -#ifdef CONFIG_PCI_MMCONFIG +#if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG) void __init pci_mmcfg_early_init(void); void __init pci_mmcfg_late_init(void); #else -- cgit v1.2.3 From 9c7cb891ecfea3b88e4fa255afeec0da84ea6a86 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 21:55:14 +0200 Subject: PCI: Refactor pci_bus_assign_domain_nr() for CONFIG_PCI_DOMAINS_GENERIC Instead of assigning bus->domain_nr inside pci_bus_assign_domain_nr(), return the domain and let the caller do the assignment. Rename pci_bus_assign_domain_nr() to pci_bus_find_domain_nr() to reflect this. No functional change intended. [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi --- drivers/pci/pci.c | 4 ++-- drivers/pci/probe.c | 4 +++- include/linux/pci.h | 7 +------ 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux/pci.h') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index eb431b5c3685..b9a783385eae 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4941,7 +4941,7 @@ int pci_get_new_domain_nr(void) } #ifdef CONFIG_PCI_DOMAINS_GENERIC -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) { static int use_dt_domains = -1; int domain = -1; @@ -4985,7 +4985,7 @@ void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) domain = -1; } - bus->domain_nr = domain; + return domain; } #endif #endif diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e3ef720997d..380d46dc9a70 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2127,7 +2127,9 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, b->sysdata = sysdata; b->ops = ops; b->number = b->busn_res.start = bus; - pci_bus_assign_domain_nr(b, parent); +#ifdef CONFIG_PCI_DOMAINS_GENERIC + b->domain_nr = pci_bus_find_domain_nr(b, parent); +#endif b2 = pci_find_bus(pci_domain_nr(b), bus); if (b2) { /* If we already got to this bus through a different bridge, ignore it */ diff --git a/include/linux/pci.h b/include/linux/pci.h index ce03d650279b..48839e817ef2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1390,12 +1390,7 @@ static inline int pci_domain_nr(struct pci_bus *bus) { return bus->domain_nr; } -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent); -#else -static inline void pci_bus_assign_domain_nr(struct pci_bus *bus, - struct device *parent) -{ -} +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif /* some architectures require additional setup to direct VGA traffic */ -- cgit v1.2.3 From 2ab51ddeca2fc32a7040d8560415be3366fa9ba7 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 15:36:26 -0500 Subject: ARM64: PCI: Add acpi_pci_bus_find_domain_nr() Extend pci_bus_find_domain_nr() so it can find the domain from either: - ACPI, via the new acpi_pci_bus_find_domain_nr() interface, or - DT, via of_pci_bus_find_domain_nr() Note that this is only used for CONFIG_PCI_DOMAINS_GENERIC=y, so it does not affect x86 or ia64. [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- arch/arm64/kernel/pci.c | 7 +++++++ drivers/pci/pci.c | 4 +++- include/linux/pci.h | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux/pci.h') diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 3c4e308b40a0..d5d3d26834cf 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -17,6 +17,7 @@ #include #include #include +#include #include /* @@ -85,6 +86,12 @@ EXPORT_SYMBOL(pcibus_to_node); #endif #ifdef CONFIG_ACPI + +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ + return 0; +} + /* Root bridge scanning */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 97f7cd4a7e86..4834ceeca0d2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -7,6 +7,7 @@ * Copyright 1997 -- 2000 Martin Mares */ +#include #include #include #include @@ -4990,7 +4991,8 @@ static int of_pci_bus_find_domain_nr(struct device *parent) int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) { - return of_pci_bus_find_domain_nr(parent); + return acpi_disabled ? of_pci_bus_find_domain_nr(parent) : + acpi_pci_bus_find_domain_nr(bus); } #endif #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index 48839e817ef2..0671e9a840cb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1390,6 +1390,12 @@ static inline int pci_domain_nr(struct pci_bus *bus) { return bus->domain_nr; } +#ifdef CONFIG_ACPI +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus); +#else +static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ return 0; } +#endif int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif -- cgit v1.2.3 From 9661e783d830699a65f7d42e4b4a76e6923089eb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Jun 2016 20:48:33 +0300 Subject: PCI / PM: Enforce type casting for pci_power_t When casting variables of type pci_power_t, a static analysis tool complains: include/linux/pci.h:119:37: warning: cast from restricted pci_power_t Enforce type casting to make the static analyzer happy. Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/pci.h') diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..8d748345b158 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -116,7 +116,7 @@ extern const char *pci_power_names[]; static inline const char *pci_power_name(pci_power_t state) { - return pci_power_names[1 + (int) state]; + return pci_power_names[1 + (__force int) state]; } #define PCI_PM_D2_DELAY 200 -- cgit v1.2.3 From 9d26d3a8f1b0c442339a235f9508bdad8af91043 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 2 Jun 2016 11:17:12 +0300 Subject: PCI: Put PCIe ports into D3 during suspend Currently the Linux PCI core does not touch power state of PCI bridges and PCIe ports when system suspend is entered. Leaving them in D0 consumes power unnecessarily and may prevent the CPU from entering deeper C-states. With recent PCIe hardware we can power down the ports to save power given that we take into account few restrictions: - The PCIe port hardware is recent enough, starting from 2015. - Devices connected to PCIe ports are effectively in D3cold once the port is transitioned to D3 (the config space is not accessible anymore and the link may be powered down). - Devices behind the PCIe port need to be allowed to transition to D3cold and back. There is a way both drivers and userspace can forbid this. - If the device behind the PCIe port is capable of waking the system it needs to be able to do so from D3cold. This patch adds a new flag to struct pci_device called 'bridge_d3'. This flag is set and cleared by the PCI core whenever there is a change in power management state of any of the devices behind the PCIe port. When system later on is suspended we only need to check this flag and if it is true transition the port to D3 otherwise we leave it in D0. Also provide override mechanism via command line parameter "pcie_port_pm=[off|force]" that can be used to disable or enable the feature regardless of the BIOS manufacturing date. Tested-by: Lukas Wunner Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 4 + drivers/pci/bus.c | 1 + drivers/pci/pci-driver.c | 5 +- drivers/pci/pci-sysfs.c | 5 ++ drivers/pci/pci.c | 175 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 11 +++ drivers/pci/remove.c | 2 + drivers/usb/host/xhci-pci.c | 2 +- include/linux/pci.h | 3 + 9 files changed, 203 insertions(+), 5 deletions(-) (limited to 'include/linux/pci.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..86edee4cedd4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3047,6 +3047,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. compat Treat PCIe ports as PCI-to-PCI bridges, disable the PCIe ports driver. + pcie_port_pm= [PCIE] PCIe port power management handling: + off Disable power management of all PCIe ports + force Forcibly enable power management of all PCIe ports + pcie_pme= [PCIE,PM] Native PCIe PME signaling options: nomsi Do not use MSI for native PCIe PME signaling (this makes all PCIe root ports use INTx for all services). diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index dd7cdbee8029..28731360b457 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -291,6 +291,7 @@ void pci_bus_add_device(struct pci_dev *dev) pci_fixup_device(pci_fixup_final, dev); pci_create_sysfs_dev_files(dev); pci_proc_attach_device(dev); + pci_bridge_d3_device_changed(dev); dev->match_driver = true; retval = device_attach(&dev->dev); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d7ffd66814bb..e39a67c8ef39 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -777,7 +777,7 @@ static int pci_pm_suspend_noirq(struct device *dev) if (!pci_dev->state_saved) { pci_save_state(pci_dev); - if (!pci_has_subordinate(pci_dev)) + if (pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } @@ -1144,7 +1144,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return -ENOSYS; pci_dev->state_saved = false; - pci_dev->no_d3cold = false; error = pm->runtime_suspend(dev); if (error) { /* @@ -1161,8 +1160,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return error; } - if (!pci_dev->d3cold_allowed) - pci_dev->no_d3cold = true; pci_fixup_device(pci_fixup_suspend, pci_dev); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index d319a9ca9b7b..bcd10c795284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -406,6 +406,11 @@ static ssize_t d3cold_allowed_store(struct device *dev, return -EINVAL; pdev->d3cold_allowed = !!val; + if (pdev->d3cold_allowed) + pci_d3cold_enable(pdev); + else + pci_d3cold_disable(pdev); + pm_runtime_resume(dev); return count; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..9ff7183e25a2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -101,6 +102,21 @@ unsigned int pcibios_max_latency = 255; /* If set, the PCIe ARI capability will not be used. */ static bool pcie_ari_disabled; +/* Disable bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_disable; +/* Force bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_force; + +static int __init pcie_port_pm_setup(char *str) +{ + if (!strcmp(str, "off")) + pci_bridge_d3_disable = true; + else if (!strcmp(str, "force")) + pci_bridge_d3_force = true; + return 1; +} +__setup("pcie_port_pm=", pcie_port_pm_setup); + /** * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children * @bus: pointer to PCI bus structure to search @@ -2155,6 +2171,164 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev) pm_runtime_put_sync(parent); } +/** + * pci_bridge_d3_possible - Is it possible to put the bridge into D3 + * @bridge: Bridge to check + * + * This function checks if it is possible to move the bridge to D3. + * Currently we only allow D3 for recent enough PCIe ports. + */ +static bool pci_bridge_d3_possible(struct pci_dev *bridge) +{ + unsigned int year; + + if (!pci_is_pcie(bridge)) + return false; + + switch (pci_pcie_type(bridge)) { + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + if (pci_bridge_d3_disable) + return false; + if (pci_bridge_d3_force) + return true; + + /* + * It should be safe to put PCIe ports from 2015 or newer + * to D3. + */ + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2015) { + return true; + } + break; + } + + return false; +} + +static int pci_dev_check_d3cold(struct pci_dev *dev, void *data) +{ + bool *d3cold_ok = data; + bool no_d3cold; + + /* + * The device needs to be allowed to go D3cold and if it is wake + * capable to do so from D3cold. + */ + no_d3cold = dev->no_d3cold || !dev->d3cold_allowed || + (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) || + !pci_power_manageable(dev); + + *d3cold_ok = !no_d3cold; + + return no_d3cold; +} + +/* + * pci_bridge_d3_update - Update bridge D3 capabilities + * @dev: PCI device which is changed + * @remove: Is the device being removed + * + * Update upstream bridge PM capabilities accordingly depending on if the + * device PM configuration was changed or the device is being removed. The + * change is also propagated upstream. + */ +static void pci_bridge_d3_update(struct pci_dev *dev, bool remove) +{ + struct pci_dev *bridge; + bool d3cold_ok = true; + + bridge = pci_upstream_bridge(dev); + if (!bridge || !pci_bridge_d3_possible(bridge)) + return; + + pci_dev_get(bridge); + /* + * If the device is removed we do not care about its D3cold + * capabilities. + */ + if (!remove) + pci_dev_check_d3cold(dev, &d3cold_ok); + + if (d3cold_ok) { + /* + * We need to go through all children to find out if all of + * them can still go to D3cold. + */ + pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold, + &d3cold_ok); + } + + if (bridge->bridge_d3 != d3cold_ok) { + bridge->bridge_d3 = d3cold_ok; + /* Propagate change to upstream bridges */ + pci_bridge_d3_update(bridge, false); + } + + pci_dev_put(bridge); +} + +/** + * pci_bridge_d3_device_changed - Update bridge D3 capabilities on change + * @dev: PCI device that was changed + * + * If a device is added or its PM configuration, such as is it allowed to + * enter D3cold, is changed this function updates upstream bridge PM + * capabilities accordingly. + */ +void pci_bridge_d3_device_changed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, false); +} + +/** + * pci_bridge_d3_device_removed - Update bridge D3 capabilities on remove + * @dev: PCI device being removed + * + * Function updates upstream bridge PM capabilities based on other devices + * still left on the bus. + */ +void pci_bridge_d3_device_removed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, true); +} + +/** + * pci_d3cold_enable - Enable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to enable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_enable(struct pci_dev *dev) +{ + if (dev->no_d3cold) { + dev->no_d3cold = false; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_enable); + +/** + * pci_d3cold_disable - Disable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to disable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_disable(struct pci_dev *dev) +{ + if (!dev->no_d3cold) { + dev->no_d3cold = true; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_disable); + /** * pci_pm_init - Initialize PM functions of given PCI device * @dev: PCI device to handle. @@ -2189,6 +2363,7 @@ void pci_pm_init(struct pci_dev *dev) dev->pm_cap = pm; dev->d3_delay = PCI_PM_D3_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; + dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; dev->d1_support = false; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index a814bbb80fcb..9730c474b016 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -82,6 +82,8 @@ void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); +void pci_bridge_d3_device_changed(struct pci_dev *dev); +void pci_bridge_d3_device_removed(struct pci_dev *dev); static inline void pci_wakeup_event(struct pci_dev *dev) { @@ -94,6 +96,15 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev) return !!(pci_dev->subordinate); } +static inline bool pci_power_manageable(struct pci_dev *pci_dev) +{ + /* + * Currently we allow normal PCI devices and PCI bridges transition + * into D3 if their bridge_d3 is set. + */ + return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; +} + struct pci_vpd_ops { ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 8982026637d5..d1ef7acf6930 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -96,6 +96,8 @@ static void pci_remove_bus_device(struct pci_dev *dev) dev->subordinate = NULL; } + pci_bridge_d3_device_removed(dev); + pci_destroy_dev(dev); } diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 48672fac7ff3..ac352fe391f4 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -382,7 +382,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup) * need to have the registers polled during D3, so avoid D3cold. */ if (xhci->quirks & XHCI_COMP_MODE_QUIRK) - pdev->no_d3cold = true; + pci_d3cold_disable(pdev); if (xhci->quirks & XHCI_PME_STUCK_QUIRK) xhci_pme_quirk(hcd); diff --git a/include/linux/pci.h b/include/linux/pci.h index 8d748345b158..8597b423cb63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -294,6 +294,7 @@ struct pci_dev { unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ unsigned int no_d3cold:1; /* D3cold is forbidden */ + unsigned int bridge_d3:1; /* Allow D3 for bridge */ unsigned int d3cold_allowed:1; /* D3cold is allowed by user */ unsigned int mmio_always_on:1; /* disallow turning off io/mem decoding during bar sizing */ @@ -1083,6 +1084,8 @@ int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); +void pci_d3cold_enable(struct pci_dev *dev); +void pci_d3cold_disable(struct pci_dev *dev); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) -- cgit v1.2.3 From 8221a013528597edc18d371b22667e8eefca599b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 17 Jun 2016 14:43:34 -0500 Subject: PCI: Unify pci_resource_to_user() declarations Replace the pci_resource_to_user() declarations in each arch that defines HAVE_ARCH_PCI_RESOURCE_TO_USER with a single one in linux/pci.h. Change the MIPS static inline implementation to a non-inline version so the static inline doesn't conflict with the new non-static linux/pci.h declaration. No functional change intended. Signed-off-by: Bjorn Helgaas --- arch/microblaze/include/asm/pci.h | 3 --- arch/mips/include/asm/pci.h | 10 ---------- arch/mips/pci/pci.c | 10 ++++++++++ arch/powerpc/include/asm/pci.h | 3 --- arch/sparc/include/asm/pci_64.h | 3 --- include/linux/pci.h | 6 +++++- 6 files changed, 15 insertions(+), 20 deletions(-) (limited to 'include/linux/pci.h') diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index fc3ecb55f1b2..2a120bb70e54 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -82,9 +82,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 86b239d9d75d..9b63cd41213d 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -80,16 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #define HAVE_ARCH_PCI_RESOURCE_TO_USER -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - phys_addr_t size = resource_size(rsrc); - - *start = fixup_bigphys_addr(rsrc->start, size); - *end = rsrc->start + size; -} - /* * Dynamic DMA mapping stuff. * MIPS has everything mapped statically. diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index f1b11f0dea2d..5717384a986d 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -319,6 +319,16 @@ void pcibios_fixup_bus(struct pci_bus *bus) EXPORT_SYMBOL(PCIBIOS_MIN_IO); EXPORT_SYMBOL(PCIBIOS_MIN_MEM); +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, resource_size_t *start, + resource_size_t *end) +{ + phys_addr_t size = resource_size(rsrc); + + *start = fixup_bigphys_addr(rsrc->start, size); + *end = rsrc->start + size; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index a6f3ac0d4602..e9bd6cf0212f 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -136,9 +136,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 022d16008a00..2303635158f5 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -55,9 +55,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #define HAVE_ARCH_PCI_RESOURCE_TO_USER -void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); #endif /* __KERNEL__ */ #endif /* __SPARC64_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..9c201d4dbd51 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1554,7 +1554,11 @@ static inline const char *pci_name(const struct pci_dev *pdev) /* Some archs don't want to expose struct resource to userland as-is * in sysfs and /proc */ -#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER +#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end); +#else static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) -- cgit v1.2.3 From 224abb67e6eb5ac062de9239163136d5ec3155c8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 17 Jun 2016 15:23:52 -0500 Subject: PCI: Document connection between pci_power_t and hardware PM capability The dev.pme_support field, pci_pm_init(), pci_pme_capable(), and pci_raw_set_power_state() depend on the fact that the pci_power_t values (PCI_D0, PCI_D1, etc.) match the definition of the Capabilities PME_Support and the Control/Status PowerState fields in the Power Management capability (see PCI Bus Power Management spec r1.2, sec 3.2.3). Add a note to this effect at the pci_power_t typedef. Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/pci.h') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8597b423cb63..0a1a9e30359c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -101,6 +101,10 @@ enum { DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES, }; +/* + * pci_power_t values must match the bits in the Capabilities PME_Support + * and Control/Status PowerState fields in the Power Management capability. + */ typedef int __bitwise pci_power_t; #define PCI_D0 ((pci_power_t __force) 0) -- cgit v1.2.3 From be9d2e8927cef02076bb7b5b2637cd9f4be2e8df Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 7 Jun 2016 09:44:01 +0200 Subject: PCI: Add helpers to request/release memory and I/O regions Add helpers to request and release a device's memory or I/O regions. With these helpers in place, one does not need to select a device's memory or I/O regions with pci_select_bars() prior to requesting or releasing them. Suggested-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- include/linux/pci.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux/pci.h') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9c201d4dbd51..6af3b93f0710 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1402,6 +1402,34 @@ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); void pci_register_set_vga_state(arch_set_vga_state_t func); +static inline int +pci_request_io_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO), name); +} + +static inline void +pci_release_io_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO)); +} + +static inline int +pci_request_mem_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM), name); +} + +static inline void +pci_release_mem_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM)); +} + #else /* CONFIG_PCI is not enabled */ static inline void pci_set_flags(int flags) { } -- cgit v1.2.3 From 765bf9b739731b925ca26a8f05765b87f2bd9724 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 8 Jun 2016 12:04:47 +0100 Subject: PCI: Add generic pci_bus_claim_resources() All PCI resources (bridge windows and BARs) should be inserted in the iomem_resource and ioport_resource trees so we know what space is occupied and what is available for other devices. There's nothing arch-specific about this, but it is currently done by arch-specific code. Add a generic pci_bus_claim_resources() interface so we can migrate away from the arch-specific code. [bhelgaas: changelog] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas CC: Arnd Bergmann CC: Yinghai Lu --- drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 69 insertions(+) (limited to 'include/linux/pci.h') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 55641a39a3e9..1d1a2c952c35 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1423,6 +1423,74 @@ void pci_bus_assign_resources(const struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_assign_resources); +static void pci_claim_device_resources(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_resource(dev, i); + } +} + +static void pci_claim_bridge_resources(struct pci_dev *dev) +{ + int i; + + for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_bridge_resource(dev, i); + } +} + +static void pci_bus_allocate_dev_resources(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child; + + list_for_each_entry(dev, &b->devices, bus_list) { + pci_claim_device_resources(dev); + + child = dev->subordinate; + if (child) + pci_bus_allocate_dev_resources(child); + } +} + +static void pci_bus_allocate_resources(struct pci_bus *b) +{ + struct pci_bus *child; + + /* + * Carry out a depth-first search on the PCI bus + * tree to allocate bridge apertures. Read the + * programmed bridge bases and recursively claim + * the respective bridge resources. + */ + if (b->self) { + pci_read_bridge_bases(b); + pci_claim_bridge_resources(b->self); + } + + list_for_each_entry(child, &b->children, node) + pci_bus_allocate_resources(child); +} + +void pci_bus_claim_resources(struct pci_bus *b) +{ + pci_bus_allocate_resources(b); + pci_bus_allocate_dev_resources(b); +} +EXPORT_SYMBOL(pci_bus_claim_resources); + static void __pci_bridge_assign_resources(const struct pci_dev *bridge, struct list_head *add_head, struct list_head *fail_head) diff --git a/include/linux/pci.h b/include/linux/pci.h index 6af3b93f0710..d6cfecfee864 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1114,6 +1114,7 @@ int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); void pci_bus_assign_resources(const struct pci_bus *bus); +void pci_bus_claim_resources(struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); int pci_claim_bridge_resource(struct pci_dev *bridge, int i); -- cgit v1.2.3 From aff171641d181ea573380efc3f559c9de4741fc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:17 +0900 Subject: PCI: Provide sensible IRQ vector alloc/free routines Add a function to allocate and free a range of interrupt vectors, using MSI-X, MSI or legacy vectors (in that order) based on the capabilities of the underlying device and PCIe complex. Additionally a new helper is provided to get the Linux IRQ number for given device-relative vector so that the drivers don't need to allocate their own arrays to keep track of the vectors for the multi vector MSI-X case. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 467 +++++++--------------------------------- drivers/pci/msi.c | 89 ++++++++ include/linux/pci.h | 27 +++ 3 files changed, 192 insertions(+), 391 deletions(-) (limited to 'include/linux/pci.h') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1179850f453c..0ac612b8c3fb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -78,422 +78,107 @@ CONFIG_PCI_MSI option. 4.2 Using MSI -Most of the hard work is done for the driver in the PCI layer. It simply -has to request that the PCI layer set up the MSI capability for this +Most of the hard work is done for the driver in the PCI layer. The driver +simply has to request that the PCI layer set up the MSI capability for this device. -4.2.1 pci_enable_msi +To automatically use MSI or MSI-X interrupt vectors, use the following +function: -int pci_enable_msi(struct pci_dev *dev) + int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); -A successful call allocates ONE interrupt to the device, regardless -of how many MSIs the device supports. The device is switched from -pin-based interrupt mode to MSI mode. The dev->irq number is changed -to a new number which represents the message signaled interrupt; -consequently, this function should be called before the driver calls -request_irq(), because an MSI is delivered via a vector that is -different from the vector of a pin-based interrupt. +which allocates up to max_vecs interrupt vectors for a PCI device. It +returns the number of vectors allocated or a negative error. If the device +has a requirements for a minimum number of vectors the driver can pass a +min_vecs argument set to this limit, and the PCI core will return -ENOSPC +if it can't meet the minimum number of vectors. -4.2.2 pci_enable_msi_range +The flags argument should normally be set to 0, but can be used to pass the +PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support +MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in +case the device does not support legacy interrupt lines. -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +To get the Linux IRQ numbers passed to request_irq() and free_irq() and the +vectors, use the following function: -This function allows a device driver to request any number of MSI -interrupts within specified range from 'minvec' to 'maxvec'. + int pci_irq_vector(struct pci_dev *dev, unsigned int nr); -If this function returns a positive number it indicates the number of -MSI interrupts that have been successfully allocated. In this case -the device is switched from pin-based interrupt mode to MSI mode and -updates dev->irq to be the lowest of the new interrupts assigned to it. -The other interrupts assigned to the device are in the range dev->irq -to dev->irq + returned value - 1. Device driver can use the returned -number of successfully allocated MSI interrupts to further allocate -and initialize device resources. +Any allocated resources should be freed before removing the device using +the following function: -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. + void pci_free_irq_vectors(struct pci_dev *dev); -This function should be called before the driver calls request_irq(), -because MSI interrupts are delivered via vectors that are different -from the vector of a pin-based interrupt. +If a device supports both MSI-X and MSI capabilities, this API will use the +MSI-X facilities in preference to the MSI facilities. MSI-X supports any +number of interrupts between 1 and 2048. In contrast, MSI is restricted to +a maximum of 32 interrupts (and must be a power of two). In addition, the +MSI interrupt vectors must be allocated consecutively, so the system might +not be able to allocate as many vectors for MSI as it could for MSI-X. On +some platforms, MSI interrupts must all be targeted at the same set of CPUs +whereas MSI-X interrupts can all be targeted at different CPUs. -It is ideal if drivers can cope with a variable number of MSI interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. +If a device supports neither MSI-X or MSI it will fall back to a single +legacy IRQ vector. -There could be devices that can not operate with just any number of MSI -interrupts within a range. See chapter 4.3.1.3 to get the idea how to -handle such devices for MSI-X - the same logic applies to MSI. +The typical usage of MSI or MSI-X interrupts is to allocate as many vectors +as possible, likely up to the limit supported by the device. If nvec is +larger than the number supported by the device it will automatically be +capped to the supported limit, so there is no need to query the number of +vectors supported beforehand: -4.2.1.1 Maximum possible number of MSI interrupts - -The typical usage of MSI interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msi_vec_count() function: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.2.1.2 Exact number of MSI interrupts + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0); + if (nvec < 0) + goto out_err; If a driver is unable or unwilling to deal with a variable number of MSI -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, nvec, nvec); -} - -Note, unlike pci_enable_msi_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msi_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msi_exact() does). - -4.2.1.3 Single MSI mode - -The most notorious example of the request type described above is -enabling the single MSI mode for a device. It could be done by passing -two 1s as 'minvec' and 'maxvec': - -static int foo_driver_enable_single_msi(struct pci_dev *pdev) -{ - return pci_enable_msi_range(pdev, 1, 1); -} - -Note, unlike pci_enable_msi() function, which could be also used to -enable the single MSI mode, pci_enable_msi_range() returns either a -negative errno or 1 (not negative errno or 0 - as pci_enable_msi() -does). - -4.2.3 pci_enable_msi_exact - -int pci_enable_msi_exact(struct pci_dev *dev, int nvec) - -This variation on pci_enable_msi_range() call allows a device driver to -request exactly 'nvec' MSIs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. - -By contrast with pci_enable_msi_range() function, pci_enable_msi_exact() -returns zero in case of success, which indicates MSI interrupts have been -successfully allocated. - -4.2.4 pci_disable_msi - -void pci_disable_msi(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msi_range(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated MSIs. The interrupts may subsequently be assigned -to another device, so drivers should not cache the value of dev->irq. - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI enabled and thus leaking its vector. - -4.2.4 pci_msi_vec_count - -int pci_msi_vec_count(struct pci_dev *dev) - -This function could be used to retrieve the number of MSI vectors the -device requested (via the Multiple Message Capable register). The MSI -specification only allows the returned value to be a power of two, -up to a maximum of 2^5 (32). - -If this function returns a negative number, it indicates the device is -not capable of sending MSIs. - -If this function returns a positive number, it indicates the maximum -number of MSI interrupt vectors that could be allocated. - -4.3 Using MSI-X - -The MSI-X capability is much more flexible than the MSI capability. -It supports up to 2048 interrupts, each of which can be controlled -independently. To support this flexibility, drivers must use an array of -`struct msix_entry': - -struct msix_entry { - u16 vector; /* kernel uses to write alloc vector */ - u16 entry; /* driver uses to specify entry */ -}; - -This allows for the device to use these interrupts in a sparse fashion; -for example, it could use interrupts 3 and 1027 and yet allocate only a -two-element array. The driver is expected to fill in the 'entry' value -in each element of the array to indicate for which entries the kernel -should assign interrupts; it is invalid to fill in two entries with the -same number. - -4.3.1 pci_enable_msix_range - -int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) - -Calling this function asks the PCI subsystem to allocate any number of -MSI-X interrupts within specified range from 'minvec' to 'maxvec'. -The 'entries' argument is a pointer to an array of msix_entry structs -which should be at least 'maxvec' entries in size. - -On success, the device is switched into MSI-X mode and the function -returns the number of MSI-X interrupts that have been successfully -allocated. In this case the 'vector' member in entries numbered from -0 to the returned value - 1 is populated with the interrupt number; -the driver should then call request_irq() for each 'vector' that it -decides to use. The device driver is responsible for keeping track of the -interrupts assigned to the MSI-X vectors so it can free them again later. -Device driver can use the returned number of successfully allocated MSI-X -interrupts to further allocate and initialize device resources. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -This function, in contrast with pci_enable_msi_range(), does not adjust -dev->irq. The device will not generate interrupts for this interrupt -number once MSI-X is enabled. - -Device drivers should normally call this function once per device -during the initialization phase. - -It is ideal if drivers can cope with a variable number of MSI-X interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. - -There could be devices that can not operate with just any number of MSI-X -interrupts within a range. E.g., an network adapter might need let's say -four vectors per each queue it provides. Therefore, a number of MSI-X -interrupts allocated should be a multiple of four. In this case interface -pci_enable_msix_range() can not be used alone to request MSI-X interrupts -(since it can allocate any number within the range, without any notion of -the multiple of four) and the device driver should master a custom logic -to request the required number of MSI-X interrupts. - -4.3.1.1 Maximum possible number of MSI-X interrupts - -The typical usage of MSI-X interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msix_vec_count() function: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI-X interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.3.1.2 Exact number of MSI-X interrupts - -If a driver is unable or unwilling to deal with a variable number of MSI-X -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - nvec, nvec); -} - -Note, unlike pci_enable_msix_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msix_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msix_exact() does). - -4.3.1.3 Specific requirements to the number of MSI-X interrupts - -As noted above, there could be devices that can not operate with just any -number of MSI-X interrupts within a range. E.g., let's assume a device that -is only capable sending the number of MSI-X interrupts which is a power of -two. A routine that enables MSI-X mode for such device might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - maxvec, maxvec); - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } - - return rc; -} - -Note how pci_enable_msix_range() return value is analyzed for a fallback - -any error code other than -ENOSPC indicates a fatal error and should not -be retried. - -4.3.2 pci_enable_msix_exact - -int pci_enable_msix_exact(struct pci_dev *dev, - struct msix_entry *entries, int nvec) - -This variation on pci_enable_msix_range() call allows a device driver to -request exactly 'nvec' MSI-Xs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -By contrast with pci_enable_msix_range() function, pci_enable_msix_exact() -returns zero in case of success, which indicates MSI-X interrupts have been -successfully allocated. - -Another version of a routine that enables MSI-X mode for a device with -specific requirements described in chapter 4.3.1.3 might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_exact(adapter->pdev, - adapter->msix_entries, maxvec); - - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } else if (rc < 0) { - return rc; - } - - return maxvec; -} - -4.3.3 pci_disable_msix - -void pci_disable_msix(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msix_range(). -It frees the previously allocated MSI-X interrupts. The interrupts may -subsequently be assigned to another device, so drivers should not cache -the value of the 'vector' elements over a call to pci_disable_msix(). - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI-X enabled and thus leaking its vector. - -4.3.3 The MSI-X Table - -The MSI-X capability specifies a BAR and offset within that BAR for the -MSI-X Table. This address is mapped by the PCI subsystem, and should not -be accessed directly by the device driver. If the driver wishes to -mask or unmask an interrupt, it should call disable_irq() / enable_irq(). +interrupts it can request a particular number of interrupts by passing that +number to pci_alloc_irq_vectors() function as both 'min_vecs' and +'max_vecs' parameters: -4.3.4 pci_msix_vec_count + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0); + if (ret < 0) + goto out_err; -int pci_msix_vec_count(struct pci_dev *dev) +The most notorious example of the request type described above is enabling +the single MSI mode for a device. It could be done by passing two 1s as +'min_vecs' and 'max_vecs': -This function could be used to retrieve number of entries in the device -MSI-X table. + ret = pci_alloc_irq_vectors(pdev, 1, 1, 0); + if (ret < 0) + goto out_err; -If this function returns a negative number, it indicates the device is -not capable of sending MSI-Xs. +Some devices might not support using legacy line interrupts, in which case +the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform +can't provide MSI or MSI-X interrupts: -If this function returns a positive number, it indicates the maximum -number of MSI-X interrupt vectors that could be allocated. + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY); + if (nvec < 0) + goto out_err; -4.4 Handling devices implementing both MSI and MSI-X capabilities +4.3 Legacy APIs -If a device implements both MSI and MSI-X capabilities, it can -run in either MSI mode or MSI-X mode, but not both simultaneously. -This is a requirement of the PCI spec, and it is enforced by the -PCI layer. Calling pci_enable_msi_range() when MSI-X is already -enabled or pci_enable_msix_range() when MSI is already enabled -results in an error. If a device driver wishes to switch between MSI -and MSI-X at runtime, it must first quiesce the device, then switch -it back to pin-interrupt mode, before calling pci_enable_msi_range() -or pci_enable_msix_range() and resuming operation. This is not expected -to be a common operation but may be useful for debugging or testing -during development. +The following old APIs to enable and disable MSI or MSI-X interrupts should +not be used in new code: -4.5 Considerations when using MSIs + pci_enable_msi() /* deprecated */ + pci_enable_msi_range() /* deprecated */ + pci_enable_msi_exact() /* deprecated */ + pci_disable_msi() /* deprecated */ + pci_enable_msix_range() /* deprecated */ + pci_enable_msix_exact() /* deprecated */ + pci_disable_msix() /* deprecated */ -4.5.1 Choosing between MSI-X and MSI +Additionally there are APIs to provide the number of supported MSI or MSI-X +vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these +should be avoided in favor of letting pci_alloc_irq_vectors() cap the +number of vectors. If you have a legitimate special use case for the count +of vectors we might have to revisit that decision and add a +pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. -If your device supports both MSI-X and MSI capabilities, you should use -the MSI-X facilities in preference to the MSI facilities. As mentioned -above, MSI-X supports any number of interrupts between 1 and 2048. -In contrast, MSI is restricted to a maximum of 32 interrupts (and -must be a power of two). In addition, the MSI interrupt vectors must -be allocated consecutively, so the system might not be able to allocate -as many vectors for MSI as it could for MSI-X. On some platforms, MSI -interrupts must all be targeted at the same set of CPUs whereas MSI-X -interrupts can all be targeted at different CPUs. +4.4 Considerations when using MSIs -4.5.2 Spinlocks +4.4.1 Spinlocks Most device drivers have a per-device spinlock which is taken in the interrupt handler. With pin-based interrupts or a single MSI, it is not @@ -505,7 +190,7 @@ acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts and acquire the lock (see Documentation/DocBook/kernel-locking). -4.6 How to tell whether MSI/MSI-X is enabled on a device +4.5 How to tell whether MSI/MSI-X is enabled on a device Using 'lspci -v' (as root) may show some devices with "MSI", "Message Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98ace67c5f4d..5e5ab478ea7d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -4,6 +4,7 @@ * * Copyright (C) 2003-2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + * Copyright (C) 2016 Christoph Hellwig. */ #include @@ -1121,6 +1122,94 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_alloc_irq_vectors - allocate multiple IRQs for a device + * @dev: PCI device to operate on + * @min_vecs: minimum number of vectors required (must be >= 1) + * @max_vecs: maximum (desired) number of vectors + * @flags: flags or quirks for the allocation + * + * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI + * vectors if available, and fall back to a single legacy vector + * if neither is available. Return the number of vectors allocated, + * (which might be smaller than @max_vecs) if successful, or a negative + * error code on error. If less than @min_vecs interrupt vectors are + * available for @dev the function will fail with -ENOSPC. + * + * To get the Linux IRQ number used for a vector that can be passed to + * request_irq() use the pci_irq_vector() helper. + */ +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + int vecs = -ENOSPC; + + if (!(flags & PCI_IRQ_NOMSIX)) { + vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + if (!(flags & PCI_IRQ_NOMSI)) { + vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + /* use legacy irq if allowed */ + if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1) + return 1; + return vecs; +} +EXPORT_SYMBOL(pci_alloc_irq_vectors); + +/** + * pci_free_irq_vectors - free previously allocated IRQs for a device + * @dev: PCI device to operate on + * + * Undoes the allocations and enabling in pci_alloc_irq_vectors(). + */ +void pci_free_irq_vectors(struct pci_dev *dev) +{ + pci_disable_msix(dev); + pci_disable_msi(dev); +} +EXPORT_SYMBOL(pci_free_irq_vectors); + +/** + * pci_irq_vector - return Linux IRQ number of a device vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + */ +int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->irq; + i++; + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(nr >= entry->nvec_used)) + return -EINVAL; + } else { + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + } + + return dev->irq + nr; +} +EXPORT_SYMBOL(pci_irq_vector); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..52ecd49e8049 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1237,6 +1237,10 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); +#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ +#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ +#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ + /* kmem_cache style wrapper around pci_alloc_consistent() */ #include @@ -1284,6 +1288,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, return rc; return 0; } +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pci_free_irq_vectors(struct pci_dev *dev); +int pci_irq_vector(struct pci_dev *dev, unsigned int nr); + #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_msi_shutdown(struct pci_dev *dev) { } @@ -1307,6 +1316,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev, static inline int pci_enable_msix_exact(struct pci_dev *dev, struct msix_entry *entries, int nvec) { return -ENOSYS; } +static inline int pci_alloc_irq_vectors(struct pci_dev *dev, + unsigned int min_vecs, unsigned int max_vecs, + unsigned int flags) +{ + if (min_vecs > 1) + return -EINVAL; + return 1; +} +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + return dev->irq; +} #endif #ifdef CONFIG_PCIEPORTBUS -- cgit v1.2.3 From 4ef33685aa0957d771e068b60a5f3ca6b47ade1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:18 +0900 Subject: PCI: Spread interrupt vectors in pci_alloc_irq_vectors() Set the affinity_mask in the PCI device before allocating vectors so that the affinity can be propagated through the MSI descriptor structures to the core IRQ code. To facilitate this, new __pci_enable_msi_range() and __pci_enable_msix_range() helpers are factored out of their not prefixed variants which assigning the new IRQ affinity mask in the PCI device so that the low-level interrupt code can perform the interrupt affinity assignment and do node-local allocations. A new PCI_IRQ_NOAFFINITY flag is added to pci_alloc_irq_vectors() so that this function can also be used by drivers that don't wish to use the automatic affinity assignment. [bhelgaas: omit "else" after "return" consistently] Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 4 ++ drivers/pci/msi.c | 134 ++++++++++++++++++++++++++-------------- include/linux/pci.h | 2 + 3 files changed, 95 insertions(+), 45 deletions(-) (limited to 'include/linux/pci.h') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 0ac612b8c3fb..c55df2911136 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -99,6 +99,10 @@ PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in case the device does not support legacy interrupt lines. +By default this function will spread the interrupts around the available +CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY +flag. + To get the Linux IRQ numbers passed to request_irq() and free_irq() and the vectors, use the following function: diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5e5ab478ea7d..a02981efdad5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -569,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); entry->nvec_used = nvec; + entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; @@ -680,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec) { + const struct cpumask *mask = NULL; struct msi_desc *entry; - int i; + int cpu = -1, i; for (i = 0; i < nvec; i++) { + if (dev->irq_affinity) { + cpu = cpumask_next(cpu, dev->irq_affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(dev->irq_affinity); + mask = cpumask_of(cpu); + } + entry = alloc_msi_entry(&dev->dev); if (!entry) { if (!i) @@ -703,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; entry->nvec_used = 1; + entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); } @@ -1028,19 +1038,8 @@ int pci_msi_enabled(void) } EXPORT_SYMBOL(pci_msi_enabled); -/** - * pci_enable_msi_range - configure device's MSI capability structure - * @dev: device to configure - * @minvec: minimal number of interrupts to configure - * @maxvec: maximum number of interrupts to configure - * - * This function tries to allocate a maximum possible number of interrupts in a - * range between @minvec and @maxvec. It returns a negative errno if an error - * occurs. If it succeeds, it returns the actual number of interrupts allocated - * and updates the @dev's irq member to the lowest new interrupt number; - * the other interrupt numbers allocated to this device are consecutive. - **/ -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, + unsigned int flags) { int nvec; int rc; @@ -1063,25 +1062,85 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; - else if (nvec < minvec) + if (nvec < minvec) return -EINVAL; - else if (nvec > maxvec) + + if (nvec > maxvec) nvec = maxvec; - do { + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) + return -ENOSPC; + } + rc = msi_capability_init(dev, nvec); - if (rc < 0) { + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) return rc; - } else if (rc > 0) { - if (rc < minvec) + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } +} + +/** + * pci_enable_msi_range - configure device's MSI capability structure + * @dev: device to configure + * @minvec: minimal number of interrupts to configure + * @maxvec: maximum number of interrupts to configure + * + * This function tries to allocate a maximum possible number of interrupts in a + * range between @minvec and @maxvec. It returns a negative errno if an error + * occurs. If it succeeds, it returns the actual number of interrupts allocated + * and updates the @dev's irq member to the lowest new interrupt number; + * the other interrupt numbers allocated to this device are consecutive. + **/ +int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +{ + return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY); +} +EXPORT_SYMBOL(pci_enable_msi_range); + +static int __pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, int minvec, int maxvec, + unsigned int flags) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) return -ENOSPC; - nvec = rc; } - } while (rc); - return nvec; + rc = pci_enable_msix(dev, entries, nvec); + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) + return rc; + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } } -EXPORT_SYMBOL(pci_enable_msi_range); /** * pci_enable_msix_range - configure device's MSI-X capability structure @@ -1099,26 +1158,10 @@ EXPORT_SYMBOL(pci_enable_msi_range); * with new allocated MSI-X interrupts. **/ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) + int minvec, int maxvec) { - int nvec = maxvec; - int rc; - - if (maxvec < minvec) - return -ERANGE; - - do { - rc = pci_enable_msix(dev, entries, nvec); - if (rc < 0) { - return rc; - } else if (rc > 0) { - if (rc < minvec) - return -ENOSPC; - nvec = rc; - } - } while (rc); - - return nvec; + return __pci_enable_msix_range(dev, entries, minvec, maxvec, + PCI_IRQ_NOAFFINITY); } EXPORT_SYMBOL(pci_enable_msix_range); @@ -1145,13 +1188,14 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, int vecs = -ENOSPC; if (!(flags & PCI_IRQ_NOMSIX)) { - vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs, + flags); if (vecs > 0) return vecs; } if (!(flags & PCI_IRQ_NOMSI)) { - vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags); if (vecs > 0) return vecs; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 52ecd49e8049..f1406619f868 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -320,6 +320,7 @@ struct pci_dev { * directly, use the values stored here. They might be different! */ unsigned int irq; + struct cpumask *irq_affinity; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ bool match_driver; /* Skip attaching driver */ @@ -1240,6 +1241,7 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ #define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ #define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ +#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */ /* kmem_cache style wrapper around pci_alloc_consistent() */ -- cgit v1.2.3 From e16b46605960bd071a3e26f316e0bb600ae91e37 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 21 Jul 2016 21:40:28 -0600 Subject: PCI: Allow additional bus numbers for hotplug bridges A user may hot add a switch requiring more than one bus to enumerate. This previously required a system reboot if BIOS did not sufficiently pad the bus resource, which they frequently don't do. Add a kernel parameter so a user can specify the minimum number of bus numbers to reserve for a hotplug bridge's subordinate buses so rebooting won't be necessary. The default is 1, which is equivalent to previous behavior. Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- Documentation/kernel-parameters.txt | 3 +++ drivers/pci/pci.c | 8 ++++++++ drivers/pci/probe.c | 9 +++++++++ include/linux/pci.h | 1 + 4 files changed, 21 insertions(+) (limited to 'include/linux/pci.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..487e799b1da2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3016,6 +3016,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. hpmemsize=nn[KMG] The fixed amount of bus space which is reserved for hotplug bridge's memory window. Default size is 2 megabytes. + hpbussize=nn The minimum amount of additional bus numbers + reserved for buses below a hotplug bridge. + Default is 1. realloc= Enable/disable reallocating PCI bridge resources if allocations done by BIOS are too small to accommodate resources required by all child diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..f18ea90cf91a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -81,6 +81,9 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; +#define DEFAULT_HOTPLUG_BUS_SIZE 1 +unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; + enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT; /* @@ -5021,6 +5024,11 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "hpbussize=", 10)) { + pci_hotplug_bus_size = + simple_strtoul(str + 10, &str, 0); + if (pci_hotplug_bus_size > 0xff) + pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; } else if (!strncmp(str, "pcie_bus_tune_off", 17)) { pcie_bus_config = PCIE_BUS_TUNE_OFF; } else if (!strncmp(str, "pcie_bus_safe", 13)) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e3ef720997d..f680099c110d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2076,6 +2076,15 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus) max = pci_scan_bridge(bus, dev, max, pass); } + /* + * Make sure a hotplug bridge has at least the minimum requested + * number of buses. + */ + if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) { + if (max - bus->busn_res.start < pci_hotplug_bus_size - 1) + max = bus->busn_res.start + pci_hotplug_bus_size - 1; + } + /* * We've scanned the bus and so we know all about what's on * the other side of any bridges that may be on this bus plus diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..0c283254111d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1706,6 +1706,7 @@ extern u8 pci_cache_line_size; extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); -- cgit v1.2.3