From 59451e1233bd315c5379a631838a03d80e689581 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Thu, 5 May 2016 17:31:47 -0700 Subject: mtd: spi-nor: change return value of read/write Change the return value of spi-nor device read and write methods to allow returning amount of data transferred and errors as read(2)/write(2) does. Also, start handling positive returns in spi_nor_read(), since we want to convert drivers to start returning the read-length both via *retlen and the return code. (We don't need to do the same transition process for spi_nor_write(), since ->write() didn't used to have a return code at all.) Signed-off-by: Michal Suchanek Signed-off-by: Brian Norris Tested-by Cyrille Pitchen Acked-by: Michal Suchanek Tested-by: Michal Suchanek --- include/linux/mtd/spi-nor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 7f041bd88b82..34680a493156 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -173,9 +173,9 @@ struct spi_nor { int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); - int (*read)(struct spi_nor *nor, loff_t from, + ssize_t (*read)(struct spi_nor *nor, loff_t from, size_t len, size_t *retlen, u_char *read_buf); - void (*write)(struct spi_nor *nor, loff_t to, + ssize_t (*write)(struct spi_nor *nor, loff_t to, size_t len, size_t *retlen, const u_char *write_buf); int (*erase)(struct spi_nor *nor, loff_t offs); -- cgit v1.2.3 From 2dd087b16946cf168f401526adf26afa771bb740 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Thu, 5 May 2016 17:31:53 -0700 Subject: mtd: spi-nor: stop passing around retlen Do not pass retlen to hardware driver read/write functions. Update it in spi-nor generic driver instead. Signed-off-by: Michal Suchanek Signed-off-by: Brian Norris Tested-by Cyrille Pitchen Acked-by: Michal Suchanek Tested-by: Michal Suchanek --- drivers/mtd/devices/m25p80.c | 7 ++----- drivers/mtd/spi-nor/fsl-quadspi.c | 17 ++++++----------- drivers/mtd/spi-nor/mtk-quadspi.c | 8 +++----- drivers/mtd/spi-nor/nxp-spifi.c | 6 ++---- drivers/mtd/spi-nor/spi-nor.c | 21 +++++++++++++-------- include/linux/mtd/spi-nor.h | 4 ++-- 6 files changed, 28 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 2ef5a6015276..eab46d211ae6 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -74,7 +74,7 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) } static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) + const u_char *buf) { struct m25p *flash = nor->priv; struct spi_device *spi = flash->spi; @@ -106,7 +106,6 @@ static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len, ret = m.actual_length - cmd_sz; if (ret < 0) return -EIO; - *retlen += ret; return ret; } @@ -127,7 +126,7 @@ static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor) * may be any size provided it is within the physical boundaries. */ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len, - size_t *retlen, u_char *buf) + u_char *buf) { struct m25p *flash = nor->priv; struct spi_device *spi = flash->spi; @@ -156,7 +155,6 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len, msg.data_nbits = m25p80_rx_nbits(nor); ret = spi_flash_read(spi, &msg); - *retlen = msg.retlen; if (ret < 0) return ret; return msg.retlen; @@ -184,7 +182,6 @@ static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len, ret = m.actual_length - m25p_cmdsz(nor) - dummy; if (ret < 0) return -EIO; - *retlen += ret; return ret; } diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index ea296769f015..5c82e4ef1904 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -620,7 +620,7 @@ static inline void fsl_qspi_invalid(struct fsl_qspi *q) static ssize_t fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor, u8 opcode, unsigned int to, u32 *txbuf, - unsigned count, size_t *retlen) + unsigned count) { int ret, i, j; u32 tmp; @@ -647,11 +647,8 @@ static ssize_t fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor, /* Trigger it */ ret = fsl_qspi_runcmd(q, opcode, to, count); - if (ret == 0) { - if (retlen) - *retlen += count; + if (ret == 0) return count; - } return ret; } @@ -862,7 +859,7 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) } else if (len > 0) { ret = fsl_qspi_nor_write(q, nor, opcode, 0, - (u32 *)buf, len, NULL); + (u32 *)buf, len); if (ret > 0) return 0; } else { @@ -874,12 +871,11 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) } static ssize_t fsl_qspi_write(struct spi_nor *nor, loff_t to, - size_t len, size_t *retlen, const u_char *buf) + size_t len, const u_char *buf) { struct fsl_qspi *q = nor->priv; - ssize_t ret = fsl_qspi_nor_write(q, nor, nor->program_opcode, to, - (u32 *)buf, len, retlen); + (u32 *)buf, len); /* invalid the data in the AHB buffer. */ fsl_qspi_invalid(q); @@ -887,7 +883,7 @@ static ssize_t fsl_qspi_write(struct spi_nor *nor, loff_t to, } static ssize_t fsl_qspi_read(struct spi_nor *nor, loff_t from, - size_t len, size_t *retlen, u_char *buf) + size_t len, u_char *buf) { struct fsl_qspi *q = nor->priv; u8 cmd = nor->read_opcode; @@ -929,7 +925,6 @@ static ssize_t fsl_qspi_read(struct spi_nor *nor, loff_t from, memcpy(buf, q->ahb_addr + q->chip_base_addr + from - q->memmap_offs, len); - *retlen += len; return len; } diff --git a/drivers/mtd/spi-nor/mtk-quadspi.c b/drivers/mtd/spi-nor/mtk-quadspi.c index e85687126c25..21c31b373cd3 100644 --- a/drivers/mtd/spi-nor/mtk-quadspi.c +++ b/drivers/mtd/spi-nor/mtk-quadspi.c @@ -244,7 +244,7 @@ static void mt8173_nor_set_addr(struct mt8173_nor *mt8173_nor, u32 addr) } static ssize_t mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length, - size_t *retlen, u_char *buffer) + u_char *buffer) { int i, ret; int addr = (int)from; @@ -255,7 +255,7 @@ static ssize_t mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length, mt8173_nor_set_read_mode(mt8173_nor); mt8173_nor_set_addr(mt8173_nor, addr); - for (i = 0; i < length; i++, (*retlen)++) { + for (i = 0; i < length; i++) { ret = mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_PIO_READ_CMD); if (ret < 0) return ret; @@ -298,7 +298,7 @@ static int mt8173_nor_write_buffer(struct mt8173_nor *mt8173_nor, int addr, } static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) + const u_char *buf) { int ret; struct mt8173_nor *mt8173_nor = nor->priv; @@ -318,7 +318,6 @@ static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len, } to += SFLASH_WRBUF_SIZE; buf += SFLASH_WRBUF_SIZE; - (*retlen) += SFLASH_WRBUF_SIZE; } ret = mt8173_nor_write_buffer_disable(mt8173_nor); if (ret < 0) { @@ -333,7 +332,6 @@ static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len, dev_err(mt8173_nor->dev, "write single byte failed!\n"); return ret; } - (*retlen) += len; } return len; diff --git a/drivers/mtd/spi-nor/nxp-spifi.c b/drivers/mtd/spi-nor/nxp-spifi.c index b0fb86996b8f..73a14f40928b 100644 --- a/drivers/mtd/spi-nor/nxp-spifi.c +++ b/drivers/mtd/spi-nor/nxp-spifi.c @@ -173,7 +173,7 @@ static int nxp_spifi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len) } static ssize_t nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len, - size_t *retlen, u_char *buf) + u_char *buf) { struct nxp_spifi *spifi = nor->priv; int ret; @@ -183,13 +183,12 @@ static ssize_t nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len, return ret; memcpy_fromio(buf, spifi->flash_base + from, len); - *retlen += len; return len; } static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len, - size_t *retlen, const u_char *buf) + const u_char *buf) { struct nxp_spifi *spifi = nor->priv; u32 cmd; @@ -201,7 +200,6 @@ static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len, return ret; writel(to, spifi->io_base + SPIFI_ADDR); - *retlen += len; cmd = SPIFI_CMD_DOUT | SPIFI_CMD_DATALEN(len) | diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index f204d29d1139..34dd95373e77 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -1031,12 +1031,13 @@ static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len, if (ret) return ret; - ret = nor->read(nor, from, len, retlen, buf); + ret = nor->read(nor, from, len, buf); spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_READ); if (ret < 0) return ret; + *retlen += ret; return 0; } @@ -1063,7 +1064,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, nor->program_opcode = SPINOR_OP_BP; /* write one byte. */ - ret = nor->write(nor, to, 1, retlen, buf); + ret = nor->write(nor, to, 1, buf); if (ret < 0) goto sst_write_err; WARN(ret != 1, "While writing 1 byte written %i bytes\n", @@ -1079,7 +1080,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, nor->program_opcode = SPINOR_OP_AAI_WP; /* write two bytes. */ - ret = nor->write(nor, to, 2, retlen, buf + actual); + ret = nor->write(nor, to, 2, buf + actual); if (ret < 0) goto sst_write_err; WARN(ret != 2, "While writing 2 bytes written %i bytes\n", @@ -1102,7 +1103,7 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, write_enable(nor); nor->program_opcode = SPINOR_OP_BP; - ret = nor->write(nor, to, 1, retlen, buf + actual); + ret = nor->write(nor, to, 1, buf + actual); if (ret < 0) goto sst_write_err; WARN(ret != 1, "While writing 1 byte written %i bytes\n", @@ -1111,8 +1112,10 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len, if (ret) goto sst_write_err; write_disable(nor); + actual += 1; } sst_write_err: + *retlen += actual; spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE); return ret; } @@ -1141,15 +1144,17 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, /* do all the bytes fit onto one page? */ if (page_offset + len <= nor->page_size) { - ret = nor->write(nor, to, len, retlen, buf); + ret = nor->write(nor, to, len, buf); if (ret < 0) goto write_err; + *retlen += ret; } else { /* the size of data remaining on the first page */ page_size = nor->page_size - page_offset; - ret = nor->write(nor, to, page_size, retlen, buf); + ret = nor->write(nor, to, page_size, buf); if (ret < 0) goto write_err; + *retlen += ret; /* write everything in nor->page_size chunks */ for (i = ret; i < len; ) { @@ -1163,10 +1168,10 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len, write_enable(nor); - ret = nor->write(nor, to + i, page_size, retlen, - buf + i); + ret = nor->write(nor, to + i, page_size, buf + i); if (ret < 0) goto write_err; + *retlen += ret; i += ret; } } diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 34680a493156..c425c7b4c2a0 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -174,9 +174,9 @@ struct spi_nor { int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len); ssize_t (*read)(struct spi_nor *nor, loff_t from, - size_t len, size_t *retlen, u_char *read_buf); + size_t len, u_char *read_buf); ssize_t (*write)(struct spi_nor *nor, loff_t to, - size_t len, size_t *retlen, const u_char *write_buf); + size_t len, const u_char *write_buf); int (*erase)(struct spi_nor *nor, loff_t offs); int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len); -- cgit v1.2.3 From 950334bcf17a6ab55ce13d3bdf050f7b429320d5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sat, 28 May 2016 18:09:16 -0500 Subject: PCI: Add devm_request_pci_bus_resources() Several host bridge drivers iterate through the list of bridge windows to request resources. Several others don't request the window resources at all. Add a devm_request_pci_bus_resources() interface to make it easier for drivers to request all the window resources. Export to GPL modules (from Arnd Bergmann ). Signed-off-by: Bjorn Helgaas --- drivers/pci/bus.c | 30 +++++++++++++++++++++++++++++- include/linux/pci.h | 5 ++++- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index dd7cdbee8029..6293ce0f3532 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -91,6 +91,35 @@ void pci_bus_remove_resources(struct pci_bus *bus) } } +int devm_request_pci_bus_resources(struct device *dev, + struct list_head *resources) +{ + struct resource_entry *win; + struct resource *parent, *res; + int err; + + resource_list_for_each_entry(win, resources) { + res = win->res; + switch (resource_type(res)) { + case IORESOURCE_IO: + parent = &ioport_resource; + break; + case IORESOURCE_MEM: + parent = &iomem_resource; + break; + default: + continue; + } + + err = devm_request_resource(dev, parent, res); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources); + static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL}; #ifdef CONFIG_PCI_BUS_ADDR_T_64BIT static struct pci_bus_region pci_64_bit = {0, @@ -397,4 +426,3 @@ void pci_bus_put(struct pci_bus *bus) put_device(&bus->dev); } EXPORT_SYMBOL(pci_bus_put); - diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..6ac83606cb0d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1143,9 +1143,12 @@ void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); void pci_free_resource_list(struct list_head *resources); -void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags); +void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, + unsigned int flags); struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n); void pci_bus_remove_resources(struct pci_bus *bus); +int devm_request_pci_bus_resources(struct device *dev, + struct list_head *resources); #define pci_bus_for_each_resource(bus, res, i) \ for (i = 0; \ -- cgit v1.2.3 From 1c7fe6b438433e16d5b1211380c305351ac38299 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 9 Jun 2016 20:10:11 +0200 Subject: mtd: nand: add ESMT manufacturer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I got device with ESMT (Elite Semiconductor Memory Technology Inc) F59L1G81MA flash that was detected as: [ 0.852034] nand: device found, Manufacturer ID: 0xc8, Chip ID: 0xd1 [ 0.858402] nand: Unknown NAND 128MiB 3,3V 8-bit [ 0.863031] nand: 128MiB, SLC, page size: 2048, OOB size: 64 According to the F59L1G81MA datasheet (and Read Id documentation) C8h is a "Maker Code" which should mean ESMT. Add it to fix above "Unknown". Signed-off-by: Rafał Miłecki Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_ids.c | 1 + include/linux/mtd/nand.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index ccc05f5b2695..2af9869a115e 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -168,6 +168,7 @@ struct nand_flash_dev nand_flash_ids[] = { /* Manufacturer IDs */ struct nand_manufacturers nand_manuf_ids[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, + {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung"}, {NAND_MFR_FUJITSU, "Fujitsu"}, {NAND_MFR_NATIONAL, "National"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index fbe8e164a4ee..8dd6e01f45c0 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -783,6 +783,7 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) * NAND Flash Manufacturer ID Codes */ #define NAND_MFR_TOSHIBA 0x98 +#define NAND_MFR_ESMT 0xc8 #define NAND_MFR_SAMSUNG 0xec #define NAND_MFR_FUJITSU 0x04 #define NAND_MFR_NATIONAL 0x8f -- cgit v1.2.3 From 80955f9ee51760db64795b3778365ccb8c3a2729 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 10 Jun 2016 21:55:09 +0200 Subject: PCI: Move ecam.h to linux/include/pci-ecam.h This header will be used from arch/arm64 for ACPI PCI implementation so it needs to be moved out of drivers/pci. Update users of the header file to use the new name. No functional changes. Signed-off-by: Jayachandran C Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi --- drivers/pci/ecam.c | 3 +- drivers/pci/ecam.h | 67 ------------------------------------- drivers/pci/host/pci-host-common.c | 3 +- drivers/pci/host/pci-host-generic.c | 3 +- drivers/pci/host/pci-thunder-ecam.c | 3 +- drivers/pci/host/pci-thunder-pem.c | 3 +- include/linux/pci-ecam.h | 67 +++++++++++++++++++++++++++++++++++++ 7 files changed, 72 insertions(+), 77 deletions(-) delete mode 100644 drivers/pci/ecam.h create mode 100644 include/linux/pci-ecam.h (limited to 'include/linux') diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index f9832ad8efe2..820e26b473f4 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -19,10 +19,9 @@ #include #include #include +#include #include -#include "ecam.h" - /* * On 64-bit systems, we do a single ioremap for the whole config space * since we have enough virtual address range available. On 32-bit, we diff --git a/drivers/pci/ecam.h b/drivers/pci/ecam.h deleted file mode 100644 index 9878bebd45bb..000000000000 --- a/drivers/pci/ecam.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2016 Broadcom - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation (the "GPL"). - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 (GPLv2) for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 (GPLv2) along with this source code. - */ -#ifndef DRIVERS_PCI_ECAM_H -#define DRIVERS_PCI_ECAM_H - -#include -#include - -/* - * struct to hold pci ops and bus shift of the config window - * for a PCI controller. - */ -struct pci_config_window; -struct pci_ecam_ops { - unsigned int bus_shift; - struct pci_ops pci_ops; - int (*init)(struct device *, - struct pci_config_window *); -}; - -/* - * struct to hold the mappings of a config space window. This - * is expected to be used as sysdata for PCI controllers that - * use ECAM. - */ -struct pci_config_window { - struct resource res; - struct resource busr; - void *priv; - struct pci_ecam_ops *ops; - union { - void __iomem *win; /* 64-bit single mapping */ - void __iomem **winp; /* 32-bit per-bus mapping */ - }; -}; - -/* create and free pci_config_window */ -struct pci_config_window *pci_ecam_create(struct device *dev, - struct resource *cfgres, struct resource *busr, - struct pci_ecam_ops *ops); -void pci_ecam_free(struct pci_config_window *cfg); - -/* map_bus when ->sysdata is an instance of pci_config_window */ -void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, - int where); -/* default ECAM ops */ -extern struct pci_ecam_ops pci_generic_ecam_ops; - -#ifdef CONFIG_PCI_HOST_GENERIC -/* for DT-based PCI controllers that support ECAM */ -int pci_host_common_probe(struct platform_device *pdev, - struct pci_ecam_ops *ops); -#endif -#endif diff --git a/drivers/pci/host/pci-host-common.c b/drivers/pci/host/pci-host-common.c index 8cba7ab73df9..c18b9e3bb8bd 100644 --- a/drivers/pci/host/pci-host-common.c +++ b/drivers/pci/host/pci-host-common.c @@ -20,10 +20,9 @@ #include #include #include +#include #include -#include "../ecam.h" - static int gen_pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, struct resource **bus_range) { diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c index 6eaceab1bf04..f0ca6de0d87e 100644 --- a/drivers/pci/host/pci-host-generic.c +++ b/drivers/pci/host/pci-host-generic.c @@ -23,10 +23,9 @@ #include #include #include +#include #include -#include "../ecam.h" - static struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = { .bus_shift = 16, .pci_ops = { diff --git a/drivers/pci/host/pci-thunder-ecam.c b/drivers/pci/host/pci-thunder-ecam.c index 540d030613eb..a9fc1c9105fd 100644 --- a/drivers/pci/host/pci-thunder-ecam.c +++ b/drivers/pci/host/pci-thunder-ecam.c @@ -11,10 +11,9 @@ #include #include #include +#include #include -#include "../ecam.h" - static void set_val(u32 v, int where, int size, u32 *val) { int shift = (where & 3) * 8; diff --git a/drivers/pci/host/pci-thunder-pem.c b/drivers/pci/host/pci-thunder-pem.c index 9b8ab94f3c8c..5020d3d61ba7 100644 --- a/drivers/pci/host/pci-thunder-pem.c +++ b/drivers/pci/host/pci-thunder-pem.c @@ -18,10 +18,9 @@ #include #include #include +#include #include -#include "../ecam.h" - #define PEM_CFG_WR 0x28 #define PEM_CFG_RD 0x30 diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h new file mode 100644 index 000000000000..9878bebd45bb --- /dev/null +++ b/include/linux/pci-ecam.h @@ -0,0 +1,67 @@ +/* + * Copyright 2016 Broadcom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation (the "GPL"). + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 (GPLv2) for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 (GPLv2) along with this source code. + */ +#ifndef DRIVERS_PCI_ECAM_H +#define DRIVERS_PCI_ECAM_H + +#include +#include + +/* + * struct to hold pci ops and bus shift of the config window + * for a PCI controller. + */ +struct pci_config_window; +struct pci_ecam_ops { + unsigned int bus_shift; + struct pci_ops pci_ops; + int (*init)(struct device *, + struct pci_config_window *); +}; + +/* + * struct to hold the mappings of a config space window. This + * is expected to be used as sysdata for PCI controllers that + * use ECAM. + */ +struct pci_config_window { + struct resource res; + struct resource busr; + void *priv; + struct pci_ecam_ops *ops; + union { + void __iomem *win; /* 64-bit single mapping */ + void __iomem **winp; /* 32-bit per-bus mapping */ + }; +}; + +/* create and free pci_config_window */ +struct pci_config_window *pci_ecam_create(struct device *dev, + struct resource *cfgres, struct resource *busr, + struct pci_ecam_ops *ops); +void pci_ecam_free(struct pci_config_window *cfg); + +/* map_bus when ->sysdata is an instance of pci_config_window */ +void __iomem *pci_ecam_map_bus(struct pci_bus *bus, unsigned int devfn, + int where); +/* default ECAM ops */ +extern struct pci_ecam_ops pci_generic_ecam_ops; + +#ifdef CONFIG_PCI_HOST_GENERIC +/* for DT-based PCI controllers that support ECAM */ +int pci_host_common_probe(struct platform_device *pdev, + struct pci_ecam_ops *ops); +#endif +#endif -- cgit v1.2.3 From 5c3d14f76fccd14f0882fe2a11e19534a11a1674 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 10 Jun 2016 21:55:10 +0200 Subject: PCI: Add parent device field to ECAM struct pci_config_window Add a parent device field to struct pci_config_window. The parent is not saved now, but will be useful to save it in some cases. For ACPI on ARM64, it can be used to setup ACPI companion and domain. Since the parent dev is in struct pci_config_window now, we need not pass it to the init function as a separate argument. Signed-off-by: Jayachandran C Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi --- drivers/pci/ecam.c | 3 ++- drivers/pci/host/pci-thunder-pem.c | 3 ++- include/linux/pci-ecam.h | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c index 820e26b473f4..66e0d718472f 100644 --- a/drivers/pci/ecam.c +++ b/drivers/pci/ecam.c @@ -51,6 +51,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev, if (!cfg) return ERR_PTR(-ENOMEM); + cfg->parent = dev; cfg->ops = ops; cfg->busr.start = busr->start; cfg->busr.end = busr->end; @@ -94,7 +95,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev, } if (ops->init) { - err = ops->init(dev, cfg); + err = ops->init(cfg); if (err) goto err_exit; } diff --git a/drivers/pci/host/pci-thunder-pem.c b/drivers/pci/host/pci-thunder-pem.c index 5020d3d61ba7..91f6fc68d374 100644 --- a/drivers/pci/host/pci-thunder-pem.c +++ b/drivers/pci/host/pci-thunder-pem.c @@ -284,8 +284,9 @@ static int thunder_pem_config_write(struct pci_bus *bus, unsigned int devfn, return pci_generic_config_write(bus, devfn, where, size, val); } -static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg) +static int thunder_pem_init(struct pci_config_window *cfg) { + struct device *dev = cfg->parent; resource_size_t bar4_start; struct resource *res_pem; struct thunder_pem_pci *pem_pci; diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 9878bebd45bb..7adad206b1f4 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -27,8 +27,7 @@ struct pci_config_window; struct pci_ecam_ops { unsigned int bus_shift; struct pci_ops pci_ops; - int (*init)(struct device *, - struct pci_config_window *); + int (*init)(struct pci_config_window *); }; /* @@ -45,6 +44,7 @@ struct pci_config_window { void __iomem *win; /* 64-bit single mapping */ void __iomem **winp; /* 32-bit per-bus mapping */ }; + struct device *parent;/* ECAM res was from this dev */ }; /* create and free pci_config_window */ -- cgit v1.2.3 From 4d3f13845957a87729a324cce8509fad8826ef52 Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Fri, 10 Jun 2016 21:55:11 +0200 Subject: PCI: Add pci_unmap_iospace() to unmap I/O resources Add pci_unmap_iospace() to undo what pci_remap_iospace() did. This is needed to support hotplug removal of host bridges that use pci_remap_iospace(). [bhelgaas: changelog] Signed-off-by: Sinan Kaya Signed-off-by: Tomasz Nowicki Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi --- drivers/pci/pci.c | 18 ++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..eb431b5c3685 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include "pci.h" @@ -3165,6 +3166,23 @@ int __weak pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr) #endif } +/** + * pci_unmap_iospace - Unmap the memory mapped I/O space + * @res: resource to be unmapped + * + * Unmap the CPU virtual address @res from virtual address space. + * Only architectures that have memory mapped IO functions defined + * (and the PCI_IOBASE value defined) should call this function. + */ +void pci_unmap_iospace(struct resource *res) +{ +#if defined(PCI_IOBASE) && defined(CONFIG_MMU) + unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start; + + unmap_kernel_range(vaddr, resource_size(res)); +#endif +} + static void __pci_set_master(struct pci_dev *dev, bool enable) { u16 old_cmd, cmd; diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..12349deb688c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1167,6 +1167,7 @@ int pci_register_io_range(phys_addr_t addr, resource_size_t size); unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +void pci_unmap_iospace(struct resource *res); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { -- cgit v1.2.3 From 935c760ec8101413248da23b6df45f0a7a643c62 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 21:55:13 +0200 Subject: PCI/ACPI: Add generic MCFG table handling On ACPI systems that support memory-mapped config space access, i.e., ECAM, the PCI Firmware Specification says the OS can learn where the ECAM space is from either: - the static MCFG table (for non-hotpluggable bridges), or - the _CBA method (for hotpluggable bridges) The current MCFG table handling code cannot be easily generalized owing to x86-specific quirks, which makes it hard to reuse on other architectures. Implement generic MCFG handling from scratch, including: - Simple MCFG table parsing (via pci_mmcfg_late_init() as in current x86) - MCFG region lookup for a (domain, bus_start, bus_end) tuple [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Jayachandran C Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi --- drivers/acpi/Kconfig | 3 ++ drivers/acpi/Makefile | 1 + drivers/acpi/pci_mcfg.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci-acpi.h | 2 ++ include/linux/pci.h | 2 +- 5 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/pci_mcfg.c (limited to 'include/linux') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b7e2e776397d..f98c3287256e 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -217,6 +217,9 @@ config ACPI_PROCESSOR_IDLE bool select CPU_IDLE +config ACPI_MCFG + bool + config ACPI_CPPC_LIB bool depends on ACPI_PROCESSOR diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 251ce85a66fb..632e81feef69 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o acpi-y += ec.o acpi-$(CONFIG_ACPI_DOCK) += dock.o acpi-y += pci_root.o pci_link.o pci_irq.o +obj-$(CONFIG_ACPI_MCFG) += pci_mcfg.o acpi-y += acpi_lpss.o acpi_apd.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c new file mode 100644 index 000000000000..b5b376e081f5 --- /dev/null +++ b/drivers/acpi/pci_mcfg.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2016 Broadcom + * Author: Jayachandran C + * Copyright (C) 2016 Semihalf + * Author: Tomasz Nowicki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation (the "GPL"). + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 (GPLv2) for more details. + * + * You should have received a copy of the GNU General Public License + * version 2 (GPLv2) along with this source code. + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include +#include +#include + +/* Structure to hold entries from the MCFG table */ +struct mcfg_entry { + struct list_head list; + phys_addr_t addr; + u16 segment; + u8 bus_start; + u8 bus_end; +}; + +/* List to save MCFG entries */ +static LIST_HEAD(pci_mcfg_list); + +phys_addr_t pci_mcfg_lookup(u16 seg, struct resource *bus_res) +{ + struct mcfg_entry *e; + + /* + * We expect exact match, unless MCFG entry end bus covers more than + * specified by caller. + */ + list_for_each_entry(e, &pci_mcfg_list, list) { + if (e->segment == seg && e->bus_start == bus_res->start && + e->bus_end >= bus_res->end) + return e->addr; + } + + return 0; +} + +static __init int pci_mcfg_parse(struct acpi_table_header *header) +{ + struct acpi_table_mcfg *mcfg; + struct acpi_mcfg_allocation *mptr; + struct mcfg_entry *e, *arr; + int i, n; + + if (header->length < sizeof(struct acpi_table_mcfg)) + return -EINVAL; + + n = (header->length - sizeof(struct acpi_table_mcfg)) / + sizeof(struct acpi_mcfg_allocation); + mcfg = (struct acpi_table_mcfg *)header; + mptr = (struct acpi_mcfg_allocation *) &mcfg[1]; + + arr = kcalloc(n, sizeof(*arr), GFP_KERNEL); + if (!arr) + return -ENOMEM; + + for (i = 0, e = arr; i < n; i++, mptr++, e++) { + e->segment = mptr->pci_segment; + e->addr = mptr->address; + e->bus_start = mptr->start_bus_number; + e->bus_end = mptr->end_bus_number; + list_add(&e->list, &pci_mcfg_list); + } + + pr_info("MCFG table detected, %d entries\n", n); + return 0; +} + +/* Interface called by ACPI - parse and save MCFG table */ +void __init pci_mmcfg_late_init(void) +{ + int err = acpi_table_parse(ACPI_SIG_MCFG, pci_mcfg_parse); + if (err) + pr_err("Failed to parse MCFG (%d)\n", err); +} diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 89ab0572dbc6..7d63a66e8ed4 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -24,6 +24,8 @@ static inline acpi_status pci_acpi_remove_pm_notifier(struct acpi_device *dev) } extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle); +extern phys_addr_t pci_mcfg_lookup(u16 domain, struct resource *bus_res); + static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev) { struct pci_bus *pbus = pdev->bus; diff --git a/include/linux/pci.h b/include/linux/pci.h index 12349deb688c..ce03d650279b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1723,7 +1723,7 @@ void pcibios_free_irq(struct pci_dev *dev); extern struct dev_pm_ops pcibios_pm_ops; #endif -#ifdef CONFIG_PCI_MMCONFIG +#if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG) void __init pci_mmcfg_early_init(void); void __init pci_mmcfg_late_init(void); #else -- cgit v1.2.3 From 9c7cb891ecfea3b88e4fa255afeec0da84ea6a86 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 21:55:14 +0200 Subject: PCI: Refactor pci_bus_assign_domain_nr() for CONFIG_PCI_DOMAINS_GENERIC Instead of assigning bus->domain_nr inside pci_bus_assign_domain_nr(), return the domain and let the caller do the assignment. Rename pci_bus_assign_domain_nr() to pci_bus_find_domain_nr() to reflect this. No functional change intended. [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Bjorn Helgaas Reviewed-by: Lorenzo Pieralisi --- drivers/pci/pci.c | 4 ++-- drivers/pci/probe.c | 4 +++- include/linux/pci.h | 7 +------ 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index eb431b5c3685..b9a783385eae 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4941,7 +4941,7 @@ int pci_get_new_domain_nr(void) } #ifdef CONFIG_PCI_DOMAINS_GENERIC -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) { static int use_dt_domains = -1; int domain = -1; @@ -4985,7 +4985,7 @@ void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) domain = -1; } - bus->domain_nr = domain; + return domain; } #endif #endif diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e3ef720997d..380d46dc9a70 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2127,7 +2127,9 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, b->sysdata = sysdata; b->ops = ops; b->number = b->busn_res.start = bus; - pci_bus_assign_domain_nr(b, parent); +#ifdef CONFIG_PCI_DOMAINS_GENERIC + b->domain_nr = pci_bus_find_domain_nr(b, parent); +#endif b2 = pci_find_bus(pci_domain_nr(b), bus); if (b2) { /* If we already got to this bus through a different bridge, ignore it */ diff --git a/include/linux/pci.h b/include/linux/pci.h index ce03d650279b..48839e817ef2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1390,12 +1390,7 @@ static inline int pci_domain_nr(struct pci_bus *bus) { return bus->domain_nr; } -void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent); -#else -static inline void pci_bus_assign_domain_nr(struct pci_bus *bus, - struct device *parent) -{ -} +int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif /* some architectures require additional setup to direct VGA traffic */ -- cgit v1.2.3 From 2ab51ddeca2fc32a7040d8560415be3366fa9ba7 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Fri, 10 Jun 2016 15:36:26 -0500 Subject: ARM64: PCI: Add acpi_pci_bus_find_domain_nr() Extend pci_bus_find_domain_nr() so it can find the domain from either: - ACPI, via the new acpi_pci_bus_find_domain_nr() interface, or - DT, via of_pci_bus_find_domain_nr() Note that this is only used for CONFIG_PCI_DOMAINS_GENERIC=y, so it does not affect x86 or ia64. [bhelgaas: changelog] Signed-off-by: Tomasz Nowicki Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- arch/arm64/kernel/pci.c | 7 +++++++ drivers/pci/pci.c | 4 +++- include/linux/pci.h | 6 ++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c index 3c4e308b40a0..d5d3d26834cf 100644 --- a/arch/arm64/kernel/pci.c +++ b/arch/arm64/kernel/pci.c @@ -17,6 +17,7 @@ #include #include #include +#include #include /* @@ -85,6 +86,12 @@ EXPORT_SYMBOL(pcibus_to_node); #endif #ifdef CONFIG_ACPI + +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ + return 0; +} + /* Root bridge scanning */ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 97f7cd4a7e86..4834ceeca0d2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -7,6 +7,7 @@ * Copyright 1997 -- 2000 Martin Mares */ +#include #include #include #include @@ -4990,7 +4991,8 @@ static int of_pci_bus_find_domain_nr(struct device *parent) int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent) { - return of_pci_bus_find_domain_nr(parent); + return acpi_disabled ? of_pci_bus_find_domain_nr(parent) : + acpi_pci_bus_find_domain_nr(bus); } #endif #endif diff --git a/include/linux/pci.h b/include/linux/pci.h index 48839e817ef2..0671e9a840cb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1390,6 +1390,12 @@ static inline int pci_domain_nr(struct pci_bus *bus) { return bus->domain_nr; } +#ifdef CONFIG_ACPI +int acpi_pci_bus_find_domain_nr(struct pci_bus *bus); +#else +static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) +{ return 0; } +#endif int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); #endif -- cgit v1.2.3 From 9661e783d830699a65f7d42e4b4a76e6923089eb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Jun 2016 20:48:33 +0300 Subject: PCI / PM: Enforce type casting for pci_power_t When casting variables of type pci_power_t, a static analysis tool complains: include/linux/pci.h:119:37: warning: cast from restricted pci_power_t Enforce type casting to make the static analyzer happy. Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..8d748345b158 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -116,7 +116,7 @@ extern const char *pci_power_names[]; static inline const char *pci_power_name(pci_power_t state) { - return pci_power_names[1 + (int) state]; + return pci_power_names[1 + (__force int) state]; } #define PCI_PM_D2_DELAY 200 -- cgit v1.2.3 From 9d26d3a8f1b0c442339a235f9508bdad8af91043 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 2 Jun 2016 11:17:12 +0300 Subject: PCI: Put PCIe ports into D3 during suspend Currently the Linux PCI core does not touch power state of PCI bridges and PCIe ports when system suspend is entered. Leaving them in D0 consumes power unnecessarily and may prevent the CPU from entering deeper C-states. With recent PCIe hardware we can power down the ports to save power given that we take into account few restrictions: - The PCIe port hardware is recent enough, starting from 2015. - Devices connected to PCIe ports are effectively in D3cold once the port is transitioned to D3 (the config space is not accessible anymore and the link may be powered down). - Devices behind the PCIe port need to be allowed to transition to D3cold and back. There is a way both drivers and userspace can forbid this. - If the device behind the PCIe port is capable of waking the system it needs to be able to do so from D3cold. This patch adds a new flag to struct pci_device called 'bridge_d3'. This flag is set and cleared by the PCI core whenever there is a change in power management state of any of the devices behind the PCIe port. When system later on is suspended we only need to check this flag and if it is true transition the port to D3 otherwise we leave it in D0. Also provide override mechanism via command line parameter "pcie_port_pm=[off|force]" that can be used to disable or enable the feature regardless of the BIOS manufacturing date. Tested-by: Lukas Wunner Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 4 + drivers/pci/bus.c | 1 + drivers/pci/pci-driver.c | 5 +- drivers/pci/pci-sysfs.c | 5 ++ drivers/pci/pci.c | 175 ++++++++++++++++++++++++++++++++++++ drivers/pci/pci.h | 11 +++ drivers/pci/remove.c | 2 + drivers/usb/host/xhci-pci.c | 2 +- include/linux/pci.h | 3 + 9 files changed, 203 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..86edee4cedd4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3047,6 +3047,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. compat Treat PCIe ports as PCI-to-PCI bridges, disable the PCIe ports driver. + pcie_port_pm= [PCIE] PCIe port power management handling: + off Disable power management of all PCIe ports + force Forcibly enable power management of all PCIe ports + pcie_pme= [PCIE,PM] Native PCIe PME signaling options: nomsi Do not use MSI for native PCIe PME signaling (this makes all PCIe root ports use INTx for all services). diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index dd7cdbee8029..28731360b457 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -291,6 +291,7 @@ void pci_bus_add_device(struct pci_dev *dev) pci_fixup_device(pci_fixup_final, dev); pci_create_sysfs_dev_files(dev); pci_proc_attach_device(dev); + pci_bridge_d3_device_changed(dev); dev->match_driver = true; retval = device_attach(&dev->dev); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d7ffd66814bb..e39a67c8ef39 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -777,7 +777,7 @@ static int pci_pm_suspend_noirq(struct device *dev) if (!pci_dev->state_saved) { pci_save_state(pci_dev); - if (!pci_has_subordinate(pci_dev)) + if (pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } @@ -1144,7 +1144,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return -ENOSYS; pci_dev->state_saved = false; - pci_dev->no_d3cold = false; error = pm->runtime_suspend(dev); if (error) { /* @@ -1161,8 +1160,6 @@ static int pci_pm_runtime_suspend(struct device *dev) return error; } - if (!pci_dev->d3cold_allowed) - pci_dev->no_d3cold = true; pci_fixup_device(pci_fixup_suspend, pci_dev); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index d319a9ca9b7b..bcd10c795284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -406,6 +406,11 @@ static ssize_t d3cold_allowed_store(struct device *dev, return -EINVAL; pdev->d3cold_allowed = !!val; + if (pdev->d3cold_allowed) + pci_d3cold_enable(pdev); + else + pci_d3cold_disable(pdev); + pm_runtime_resume(dev); return count; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..9ff7183e25a2 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -101,6 +102,21 @@ unsigned int pcibios_max_latency = 255; /* If set, the PCIe ARI capability will not be used. */ static bool pcie_ari_disabled; +/* Disable bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_disable; +/* Force bridge_d3 for all PCIe ports */ +static bool pci_bridge_d3_force; + +static int __init pcie_port_pm_setup(char *str) +{ + if (!strcmp(str, "off")) + pci_bridge_d3_disable = true; + else if (!strcmp(str, "force")) + pci_bridge_d3_force = true; + return 1; +} +__setup("pcie_port_pm=", pcie_port_pm_setup); + /** * pci_bus_max_busnr - returns maximum PCI bus number of given bus' children * @bus: pointer to PCI bus structure to search @@ -2155,6 +2171,164 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev) pm_runtime_put_sync(parent); } +/** + * pci_bridge_d3_possible - Is it possible to put the bridge into D3 + * @bridge: Bridge to check + * + * This function checks if it is possible to move the bridge to D3. + * Currently we only allow D3 for recent enough PCIe ports. + */ +static bool pci_bridge_d3_possible(struct pci_dev *bridge) +{ + unsigned int year; + + if (!pci_is_pcie(bridge)) + return false; + + switch (pci_pcie_type(bridge)) { + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + if (pci_bridge_d3_disable) + return false; + if (pci_bridge_d3_force) + return true; + + /* + * It should be safe to put PCIe ports from 2015 or newer + * to D3. + */ + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2015) { + return true; + } + break; + } + + return false; +} + +static int pci_dev_check_d3cold(struct pci_dev *dev, void *data) +{ + bool *d3cold_ok = data; + bool no_d3cold; + + /* + * The device needs to be allowed to go D3cold and if it is wake + * capable to do so from D3cold. + */ + no_d3cold = dev->no_d3cold || !dev->d3cold_allowed || + (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) || + !pci_power_manageable(dev); + + *d3cold_ok = !no_d3cold; + + return no_d3cold; +} + +/* + * pci_bridge_d3_update - Update bridge D3 capabilities + * @dev: PCI device which is changed + * @remove: Is the device being removed + * + * Update upstream bridge PM capabilities accordingly depending on if the + * device PM configuration was changed or the device is being removed. The + * change is also propagated upstream. + */ +static void pci_bridge_d3_update(struct pci_dev *dev, bool remove) +{ + struct pci_dev *bridge; + bool d3cold_ok = true; + + bridge = pci_upstream_bridge(dev); + if (!bridge || !pci_bridge_d3_possible(bridge)) + return; + + pci_dev_get(bridge); + /* + * If the device is removed we do not care about its D3cold + * capabilities. + */ + if (!remove) + pci_dev_check_d3cold(dev, &d3cold_ok); + + if (d3cold_ok) { + /* + * We need to go through all children to find out if all of + * them can still go to D3cold. + */ + pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold, + &d3cold_ok); + } + + if (bridge->bridge_d3 != d3cold_ok) { + bridge->bridge_d3 = d3cold_ok; + /* Propagate change to upstream bridges */ + pci_bridge_d3_update(bridge, false); + } + + pci_dev_put(bridge); +} + +/** + * pci_bridge_d3_device_changed - Update bridge D3 capabilities on change + * @dev: PCI device that was changed + * + * If a device is added or its PM configuration, such as is it allowed to + * enter D3cold, is changed this function updates upstream bridge PM + * capabilities accordingly. + */ +void pci_bridge_d3_device_changed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, false); +} + +/** + * pci_bridge_d3_device_removed - Update bridge D3 capabilities on remove + * @dev: PCI device being removed + * + * Function updates upstream bridge PM capabilities based on other devices + * still left on the bus. + */ +void pci_bridge_d3_device_removed(struct pci_dev *dev) +{ + pci_bridge_d3_update(dev, true); +} + +/** + * pci_d3cold_enable - Enable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to enable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_enable(struct pci_dev *dev) +{ + if (dev->no_d3cold) { + dev->no_d3cold = false; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_enable); + +/** + * pci_d3cold_disable - Disable D3cold for device + * @dev: PCI device to handle + * + * This function can be used in drivers to disable D3cold from the device + * they handle. It also updates upstream PCI bridge PM capabilities + * accordingly. + */ +void pci_d3cold_disable(struct pci_dev *dev) +{ + if (!dev->no_d3cold) { + dev->no_d3cold = true; + pci_bridge_d3_device_changed(dev); + } +} +EXPORT_SYMBOL_GPL(pci_d3cold_disable); + /** * pci_pm_init - Initialize PM functions of given PCI device * @dev: PCI device to handle. @@ -2189,6 +2363,7 @@ void pci_pm_init(struct pci_dev *dev) dev->pm_cap = pm; dev->d3_delay = PCI_PM_D3_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; + dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; dev->d1_support = false; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index a814bbb80fcb..9730c474b016 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -82,6 +82,8 @@ void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); +void pci_bridge_d3_device_changed(struct pci_dev *dev); +void pci_bridge_d3_device_removed(struct pci_dev *dev); static inline void pci_wakeup_event(struct pci_dev *dev) { @@ -94,6 +96,15 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev) return !!(pci_dev->subordinate); } +static inline bool pci_power_manageable(struct pci_dev *pci_dev) +{ + /* + * Currently we allow normal PCI devices and PCI bridges transition + * into D3 if their bridge_d3 is set. + */ + return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; +} + struct pci_vpd_ops { ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 8982026637d5..d1ef7acf6930 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -96,6 +96,8 @@ static void pci_remove_bus_device(struct pci_dev *dev) dev->subordinate = NULL; } + pci_bridge_d3_device_removed(dev); + pci_destroy_dev(dev); } diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 48672fac7ff3..ac352fe391f4 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -382,7 +382,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup) * need to have the registers polled during D3, so avoid D3cold. */ if (xhci->quirks & XHCI_COMP_MODE_QUIRK) - pdev->no_d3cold = true; + pci_d3cold_disable(pdev); if (xhci->quirks & XHCI_PME_STUCK_QUIRK) xhci_pme_quirk(hcd); diff --git a/include/linux/pci.h b/include/linux/pci.h index 8d748345b158..8597b423cb63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -294,6 +294,7 @@ struct pci_dev { unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ unsigned int no_d3cold:1; /* D3cold is forbidden */ + unsigned int bridge_d3:1; /* Allow D3 for bridge */ unsigned int d3cold_allowed:1; /* D3cold is allowed by user */ unsigned int mmio_always_on:1; /* disallow turning off io/mem decoding during bar sizing */ @@ -1083,6 +1084,8 @@ int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); +void pci_d3cold_enable(struct pci_dev *dev); +void pci_d3cold_disable(struct pci_dev *dev); static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, bool enable) -- cgit v1.2.3 From 6c7caebc26c5f0b618f0ef6b851e9f5f27c3812f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Jun 2016 14:48:25 +0200 Subject: KVM: introduce kvm->created_vcpus The race between creating the irqchip and the first VCPU is currently fixed by checking the presence of an irqchip before updating kvm->online_vcpus, and undoing the whole VCPU creation if someone created the irqchip in the meanwhile. Instead, introduce a new field in struct kvm that will count VCPUs under a mutex, without the atomic access and memory ordering that we need elsewhere to protect the vcpus array. This also plugs the race and is more easily applicable in all similar circumstances. Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ virt/kvm/kvm_main.c | 23 +++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c9c973a7dd9..63c6ab30bc81 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -371,7 +371,15 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + + /* + * created_vcpus is protected by kvm->lock, and is incremented + * at the beginning of KVM_CREATE_VCPU. online_vcpus is only + * incremented after storing the kvm_vcpu pointer in vcpus, + * and is accessed atomically. + */ atomic_t online_vcpus; + int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 02e98f3131bd..15b757ae64e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2346,9 +2346,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (id >= KVM_MAX_VCPU_ID) return -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus == KVM_MAX_VCPUS) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + kvm->created_vcpus++; + mutex_unlock(&kvm->lock); + vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) - return PTR_ERR(vcpu); + if (IS_ERR(vcpu)) { + r = PTR_ERR(vcpu); + goto vcpu_decrement; + } preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); @@ -2361,10 +2372,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) r = -EINVAL; goto unlock_vcpu_destroy; } - if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; @@ -2397,6 +2404,10 @@ unlock_vcpu_destroy: mutex_unlock(&kvm->lock); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_decrement: + mutex_lock(&kvm->lock); + kvm->created_vcpus--; + mutex_unlock(&kvm->lock); return r; } -- cgit v1.2.3 From 557abc40d121358883d2da8bc8bf976d6e8ec332 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Jun 2016 14:50:04 +0200 Subject: KVM: remove kvm_vcpu_compatible The new created_vcpus field makes it possible to avoid the race between irqchip and VCPU creation in a much nicer way; just check under kvm->lock whether a VCPU has already been created. We can then remove KVM_APIC_ARCHITECTURE too, because at this point the symbol is only governing the default definition of kvm_vcpu_compatible. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 11 +++-------- include/linux/kvm_host.h | 6 ------ virt/kvm/Kconfig | 3 --- virt/kvm/kvm_main.c | 4 ---- 5 files changed, 3 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 639a6e34500c..ab8e32f7b9a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -32,7 +32,6 @@ config KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD - select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf227212aebb..ab2f45a50bb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3774,7 +3774,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto split_irqchip_unlock; r = kvm_setup_empty_irq_routing(kvm); if (r) @@ -3839,7 +3839,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpic) goto create_irqchip_unlock; r = -EINVAL; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); @@ -3995,7 +3995,7 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) != 0) + if (kvm->created_vcpus) r = -EBUSY; else kvm->arch.bsp_vcpu_id = arg; @@ -7639,11 +7639,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) -{ - return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); -} - struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 63c6ab30bc81..0640ee92b978 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1105,12 +1105,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #endif /* CONFIG_HAVE_KVM_EVENTFD */ -#ifdef CONFIG_KVM_APIC_ARCHITECTURE -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); -#else -static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } -#endif - static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e5d6108f5e85..b0cc1a34db27 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD bool select EVENTFD -config KVM_APIC_ARCHITECTURE - bool - config KVM_MMIO bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 15b757ae64e1..ef54b4c31792 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2368,10 +2368,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); - if (!kvm_vcpu_compatible(vcpu)) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; -- cgit v1.2.3 From 8221a013528597edc18d371b22667e8eefca599b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 17 Jun 2016 14:43:34 -0500 Subject: PCI: Unify pci_resource_to_user() declarations Replace the pci_resource_to_user() declarations in each arch that defines HAVE_ARCH_PCI_RESOURCE_TO_USER with a single one in linux/pci.h. Change the MIPS static inline implementation to a non-inline version so the static inline doesn't conflict with the new non-static linux/pci.h declaration. No functional change intended. Signed-off-by: Bjorn Helgaas --- arch/microblaze/include/asm/pci.h | 3 --- arch/mips/include/asm/pci.h | 10 ---------- arch/mips/pci/pci.c | 10 ++++++++++ arch/powerpc/include/asm/pci.h | 3 --- arch/sparc/include/asm/pci_64.h | 3 --- include/linux/pci.h | 6 +++++- 6 files changed, 15 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index fc3ecb55f1b2..2a120bb70e54 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -82,9 +82,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 86b239d9d75d..9b63cd41213d 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -80,16 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #define HAVE_ARCH_PCI_RESOURCE_TO_USER -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - phys_addr_t size = resource_size(rsrc); - - *start = fixup_bigphys_addr(rsrc->start, size); - *end = rsrc->start + size; -} - /* * Dynamic DMA mapping stuff. * MIPS has everything mapped statically. diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index f1b11f0dea2d..5717384a986d 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -319,6 +319,16 @@ void pcibios_fixup_bus(struct pci_bus *bus) EXPORT_SYMBOL(PCIBIOS_MIN_IO); EXPORT_SYMBOL(PCIBIOS_MIN_MEM); +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, resource_size_t *start, + resource_size_t *end) +{ + phys_addr_t size = resource_size(rsrc); + + *start = fixup_bigphys_addr(rsrc->start, size); + *end = rsrc->start + size; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index a6f3ac0d4602..e9bd6cf0212f 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -136,9 +136,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, pgprot_t prot); #define HAVE_ARCH_PCI_RESOURCE_TO_USER -extern void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 022d16008a00..2303635158f5 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -55,9 +55,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #define HAVE_ARCH_PCI_RESOURCE_TO_USER -void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, - resource_size_t *start, resource_size_t *end); #endif /* __KERNEL__ */ #endif /* __SPARC64_PCI_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..9c201d4dbd51 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1554,7 +1554,11 @@ static inline const char *pci_name(const struct pci_dev *pdev) /* Some archs don't want to expose struct resource to userland as-is * in sysfs and /proc */ -#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER +#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER +void pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end); +#else static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end) -- cgit v1.2.3 From 224abb67e6eb5ac062de9239163136d5ec3155c8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 17 Jun 2016 15:23:52 -0500 Subject: PCI: Document connection between pci_power_t and hardware PM capability The dev.pme_support field, pci_pm_init(), pci_pme_capable(), and pci_raw_set_power_state() depend on the fact that the pci_power_t values (PCI_D0, PCI_D1, etc.) match the definition of the Capabilities PME_Support and the Control/Status PowerState fields in the Power Management capability (see PCI Bus Power Management spec r1.2, sec 3.2.3). Add a note to this effect at the pci_power_t typedef. Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg --- include/linux/pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 8597b423cb63..0a1a9e30359c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -101,6 +101,10 @@ enum { DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES, }; +/* + * pci_power_t values must match the bits in the Capabilities PME_Support + * and Control/Status PowerState fields in the Power Management capability. + */ typedef int __bitwise pci_power_t; #define PCI_D0 ((pci_power_t __force) 0) -- cgit v1.2.3 From 4f920843d248946545415c1bf6120942048708ed Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2016 14:58:54 +0900 Subject: kconfig.h: use __is_defined() to check if MODULE is defined The macro MODULE is not a config option, it is a per-file build option. So, config_enabled(MODULE) is not sensible. (There is another case in include/linux/export.h, where config_enabled() is used against a non-config option.) This commit renames some macros in include/linux/kconfig.h for the use for non-config macros and replaces config_enabled(MODULE) with __is_defined(MODULE). I am keeping config_enabled() because it is still referenced from some places, but I expect it would be deprecated in the future. Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek --- include/linux/kconfig.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index b33c7797eb57..a94b5bf57f51 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -17,10 +17,11 @@ * the last step cherry picks the 2nd arg, we get a zero. */ #define __ARG_PLACEHOLDER_1 0, -#define config_enabled(cfg) _config_enabled(cfg) -#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) -#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) -#define ___config_enabled(__ignored, val, ...) val +#define config_enabled(cfg) ___is_defined(cfg) +#define __is_defined(x) ___is_defined(x) +#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) +#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) +#define __take_second_arg(__ignored, val, ...) val /* * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 @@ -42,7 +43,7 @@ * built-in code when CONFIG_FOO is set to 'm'. */ #define IS_REACHABLE(option) (config_enabled(option) || \ - (config_enabled(option##_MODULE) && config_enabled(MODULE))) + (config_enabled(option##_MODULE) && __is_defined(MODULE))) /* * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', -- cgit v1.2.3 From 6023d2369ba7b82b0588fd6fcdd558a6fef200ae Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2016 14:58:55 +0900 Subject: export.h: use __is_defined() to check if __KSYM_* is defined Here the need is for a macro that checks whether the given symbol is defined or not, which has nothing to do with config. The new macro __is_defined() is a better fit for this case. Signed-off-by: Masahiro Yamada Acked-by: Nicolas Pitre Signed-off-by: Michal Marek --- include/linux/export.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 2f9ccbe6a639..c565f87f005e 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -82,7 +82,7 @@ extern struct module __this_module; #include #define __EXPORT_SYMBOL(sym, sec) \ - __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym)) + __cond_export_sym(sym, sec, __is_defined(__KSYM_##sym)) #define __cond_export_sym(sym, sec, conf) \ ___cond_export_sym(sym, sec, conf) #define ___cond_export_sym(sym, sec, enabled) \ -- cgit v1.2.3 From 05a25c8e2c593914c18faf91dfb5ee471b79ce58 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2016 14:58:56 +0900 Subject: kconfig.h: use already defined macros for IS_REACHABLE() define For the same reason as commit 02d699f1f464 ("include/linux/kconfig.h: ese macros which are already defined"), it is better to use macros IS_BUILTIN() and IS_MODULE() for defining IS_REACHABLE(). Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek --- include/linux/kconfig.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index a94b5bf57f51..722c7d2c48d6 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -42,8 +42,8 @@ * This is similar to IS_ENABLED(), but returns false when invoked from * built-in code when CONFIG_FOO is set to 'm'. */ -#define IS_REACHABLE(option) (config_enabled(option) || \ - (config_enabled(option##_MODULE) && __is_defined(MODULE))) +#define IS_REACHABLE(option) (IS_BUILTIN(option) || \ + (IS_MODULE(option) && __is_defined(MODULE))) /* * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', -- cgit v1.2.3 From 5e8754fd80b0a594f720f44d32bf28c7b06ba5a6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2016 14:58:57 +0900 Subject: kconfig.h: allow to use IS_{ENABLE,REACHABLE} in macro expansion The typical usage of IS_ENABLED() is if (IS_ENABLED(CONFIG_FOO)) { ... } or #if IS_ENABLED(CONFIG_FOO) ... #endif The current implementation of IS_ENABLED() includes "||" operator, which works well in those expressions like above. However, there is a case where we want to evaluate a config option beyond those use cases. For example, the OF_TABLE() in include/asm-generic/vmlinux.lds.h needs to evaluate a config option in macro expansion: #define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name) #define __OF_TABLE(cfg, name) ___OF_TABLE(cfg, name) #define OF_TABLE(cfg, name) __OF_TABLE(config_enabled(cfg), name) #define _OF_TABLE_0(name) #define _OF_TABLE_1(name) \ ... Here, we can not use IS_ENABLED() because of the "||" operator in its define. It is true config_enabled() works well, but it is a bit ambiguous to be used against config options. This commit makes IS_ENABLED() available in more generic context by calculating "or" with macro expansion only. Do likewise for IS_REACHABLE(). Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek --- include/linux/kconfig.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 722c7d2c48d6..15ec117ec537 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -3,6 +3,21 @@ #include +#define __ARG_PLACEHOLDER_1 0, +#define __take_second_arg(__ignored, val, ...) val + +/* + * The use of "&&" / "||" is limited in certain expressions. + * The followings enable to calculate "and" / "or" with macro expansion only. + */ +#define __and(x, y) ___and(x, y) +#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y) +#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0) + +#define __or(x, y) ___or(x, y) +#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y) +#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y) + /* * Helper macros to use CONFIG_ options in C/CPP expressions. Note that * these only work with boolean and tristate options. @@ -16,12 +31,10 @@ * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when * the last step cherry picks the 2nd arg, we get a zero. */ -#define __ARG_PLACEHOLDER_1 0, #define config_enabled(cfg) ___is_defined(cfg) #define __is_defined(x) ___is_defined(x) #define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) #define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) -#define __take_second_arg(__ignored, val, ...) val /* * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 @@ -42,14 +55,13 @@ * This is similar to IS_ENABLED(), but returns false when invoked from * built-in code when CONFIG_FOO is set to 'm'. */ -#define IS_REACHABLE(option) (IS_BUILTIN(option) || \ - (IS_MODULE(option) && __is_defined(MODULE))) +#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \ + __and(IS_MODULE(option), __is_defined(MODULE))) /* * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', * 0 otherwise. */ -#define IS_ENABLED(option) \ - (IS_BUILTIN(option) || IS_MODULE(option)) +#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) #endif /* __LINUX_KCONFIG_H */ -- cgit v1.2.3 From df9b2b4a4aa49f874f8507680a533369e4b9c378 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 20 Jun 2016 12:09:41 +0200 Subject: mm/page_ref: introduce page_ref_inc_return Let's introduce that helper. Signed-off-by: David Hildenbrand Acked-by: Andrew Morton Signed-off-by: Christian Borntraeger --- include/linux/page_ref.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8b5e0a9f2431..610e13271918 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int page_ref_inc_return(struct page *page) +{ + int ret = atomic_inc_return(&page->_refcount); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, 1, ret); + return ret; +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); -- cgit v1.2.3 From be9d2e8927cef02076bb7b5b2637cd9f4be2e8df Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 7 Jun 2016 09:44:01 +0200 Subject: PCI: Add helpers to request/release memory and I/O regions Add helpers to request and release a device's memory or I/O regions. With these helpers in place, one does not need to select a device's memory or I/O regions with pci_select_bars() prior to requesting or releasing them. Suggested-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig --- include/linux/pci.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 9c201d4dbd51..6af3b93f0710 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1402,6 +1402,34 @@ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); void pci_register_set_vga_state(arch_set_vga_state_t func); +static inline int +pci_request_io_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO), name); +} + +static inline void +pci_release_io_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_IO)); +} + +static inline int +pci_request_mem_regions(struct pci_dev *pdev, const char *name) +{ + return pci_request_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM), name); +} + +static inline void +pci_release_mem_regions(struct pci_dev *pdev) +{ + return pci_release_selected_regions(pdev, + pci_select_bars(pdev, IORESOURCE_MEM)); +} + #else /* CONFIG_PCI is not enabled */ static inline void pci_set_flags(int flags) { } -- cgit v1.2.3 From 765bf9b739731b925ca26a8f05765b87f2bd9724 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 8 Jun 2016 12:04:47 +0100 Subject: PCI: Add generic pci_bus_claim_resources() All PCI resources (bridge windows and BARs) should be inserted in the iomem_resource and ioport_resource trees so we know what space is occupied and what is available for other devices. There's nothing arch-specific about this, but it is currently done by arch-specific code. Add a generic pci_bus_claim_resources() interface so we can migrate away from the arch-specific code. [bhelgaas: changelog] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas CC: Arnd Bergmann CC: Yinghai Lu --- drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 69 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 55641a39a3e9..1d1a2c952c35 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1423,6 +1423,74 @@ void pci_bus_assign_resources(const struct pci_bus *bus) } EXPORT_SYMBOL(pci_bus_assign_resources); +static void pci_claim_device_resources(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_resource(dev, i); + } +} + +static void pci_claim_bridge_resources(struct pci_dev *dev) +{ + int i; + + for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = &dev->resource[i]; + + if (!r->flags || r->parent) + continue; + + pci_claim_bridge_resource(dev, i); + } +} + +static void pci_bus_allocate_dev_resources(struct pci_bus *b) +{ + struct pci_dev *dev; + struct pci_bus *child; + + list_for_each_entry(dev, &b->devices, bus_list) { + pci_claim_device_resources(dev); + + child = dev->subordinate; + if (child) + pci_bus_allocate_dev_resources(child); + } +} + +static void pci_bus_allocate_resources(struct pci_bus *b) +{ + struct pci_bus *child; + + /* + * Carry out a depth-first search on the PCI bus + * tree to allocate bridge apertures. Read the + * programmed bridge bases and recursively claim + * the respective bridge resources. + */ + if (b->self) { + pci_read_bridge_bases(b); + pci_claim_bridge_resources(b->self); + } + + list_for_each_entry(child, &b->children, node) + pci_bus_allocate_resources(child); +} + +void pci_bus_claim_resources(struct pci_bus *b) +{ + pci_bus_allocate_resources(b); + pci_bus_allocate_dev_resources(b); +} +EXPORT_SYMBOL(pci_bus_claim_resources); + static void __pci_bridge_assign_resources(const struct pci_dev *bridge, struct list_head *add_head, struct list_head *fail_head) diff --git a/include/linux/pci.h b/include/linux/pci.h index 6af3b93f0710..d6cfecfee864 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1114,6 +1114,7 @@ int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); void pci_bus_assign_resources(const struct pci_bus *bus); +void pci_bus_claim_resources(struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); int pci_claim_bridge_resource(struct pci_dev *bridge, int i); -- cgit v1.2.3 From ebaac1736245e78109cd47d453a86a18dcfc94b8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 15:09:28 +0200 Subject: context_tracking: move rcu_virt_note_context_switch out of kvm_host.h Make kvm_guest_{enter,exit} and __kvm_guest_{enter,exit} trivial wrappers around the code in context_tracking.h. Name the context_tracking.h functions consistently with what those for kernel<->user switch. Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Reviewed-by: Rik van Riel Signed-off-by: Paolo Bonzini --- include/linux/context_tracking.h | 38 ++++++++++++++++++++++++++++++++++---- include/linux/kvm_host.h | 25 ++++--------------------- 2 files changed, 38 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d259274238db..ff4a32d24d56 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -84,7 +84,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void guest_enter(void) +/* must be called with irqs disabled */ +static inline void guest_enter_irqoff(void) { if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); @@ -93,9 +94,19 @@ static inline void guest_enter(void) if (context_tracking_is_enabled()) __context_tracking_enter(CONTEXT_GUEST); + + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_cpu_is_enabled()) + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); @@ -107,7 +118,7 @@ static inline void guest_exit(void) } #else -static inline void guest_enter(void) +static inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe @@ -116,9 +127,10 @@ static inline void guest_enter(void) */ vtime_account_system(current); current->flags |= PF_VCPU; + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); @@ -126,4 +138,22 @@ static inline void guest_exit(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_enter_irqoff(); + local_irq_restore(flags); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0640ee92b978..ffff40522688 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -878,40 +878,23 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, /* must be called with irqs disabled */ static inline void __kvm_guest_enter(void) { - guest_enter(); - /* KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. - */ - if (!context_tracking_cpu_is_enabled()) - rcu_virt_note_context_switch(smp_processor_id()); + guest_enter_irqoff(); } /* must be called with irqs disabled */ static inline void __kvm_guest_exit(void) { - guest_exit(); + guest_exit_irqoff(); } static inline void kvm_guest_enter(void) { - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_enter(); - local_irq_restore(flags); + guest_enter(); } static inline void kvm_guest_exit(void) { - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_exit(); - local_irq_restore(flags); + guest_exit(); } /* -- cgit v1.2.3 From 6edaa5307f3f51e4e56dc4c63f68a69d88c6ddf5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 15:18:26 +0200 Subject: KVM: remove kvm_guest_enter/exit wrappers Use the functions from context_tracking.h directly. Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Reviewed-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/arm/kvm/arm.c | 8 ++++---- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_pr.c | 4 ++-- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/s390/kvm/vsie.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 22 ---------------------- 10 files changed, 19 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f20ca84537f5..9ac4970882fe 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -615,7 +615,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - __kvm_guest_enter(); + guest_enter_irqoff(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); @@ -641,14 +641,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); /* - * We do local_irq_enable() before calling kvm_guest_exit() so + * We do local_irq_enable() before calling guest_exit() so * that if a timer interrupt hits while running the guest we * account that tick as being spent in the guest. We enable - * preemption after calling kvm_guest_exit() so that if we get + * preemption after calling guest_exit() so that if we get * preempted we make sure ticks after that is not counted as * guest time. */ - kvm_guest_exit(); + guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5a2b9034a05c..5f1163653b50 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -406,7 +406,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - __kvm_guest_enter(); + guest_enter_irqoff(); /* Disable hardware page table walking while in guest */ htw_stop(); @@ -418,7 +418,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Re-enable HTW before enabling interrupts */ htw_start(); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); if (vcpu->sigset_active) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae5ca7a..6b2859c12ae8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2522,7 +2522,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) spin_unlock(&pvc->lock); - kvm_guest_enter(); + guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2570,7 +2570,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - kvm_guest_exit(); + guest_exit(); for (sub = 0; sub < core_info.n_subcores; ++sub) list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8e4f64f0b774..6a66c5ff0827 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + guest_exit(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -1531,7 +1531,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_clear_debug(vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Make sure we save the guest FPU/Altivec/VSX state */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4afae695899a..02b4672f7347 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Switch back to user space debug context */ @@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } trace_kvm_exit(exit_nr, vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fea7653..1ac036e45ed4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - __kvm_guest_enter(); + guest_enter_irqoff(); return 1; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 03eeeb0ded24..d42428c11794 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2623,14 +2623,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6895e7b3be12..c106488b4137 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -765,13 +765,13 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); local_irq_disable(); - kvm_guest_enter(); + guest_enter_irqoff(); local_irq_enable(); rc = sie64a(scb_s, vcpu->run->s.regs.gprs); local_irq_disable(); - kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e50e2ad6d08..618463abeec5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6658,7 +6658,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6717,7 +6717,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ barrier(); - kvm_guest_exit(); + guest_exit(); preempt_enable(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ffff40522688..66b2f6159aad 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -875,28 +875,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -/* must be called with irqs disabled */ -static inline void __kvm_guest_enter(void) -{ - guest_enter_irqoff(); -} - -/* must be called with irqs disabled */ -static inline void __kvm_guest_exit(void) -{ - guest_exit_irqoff(); -} - -static inline void kvm_guest_enter(void) -{ - guest_enter(); -} - -static inline void kvm_guest_exit(void) -{ - guest_exit(); -} - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. -- cgit v1.2.3 From c63cf538eb4bf6a5ffd3750366d8d56f023645a5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 12 Jul 2016 22:09:26 +0200 Subject: KVM: pass struct kvm to kvm_set_routing_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arch-specific code will use it. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/mpic.c | 3 ++- arch/s390/kvm/interrupt.c | 3 ++- arch/x86/kvm/irq_comm.c | 3 ++- include/linux/kvm_host.h | 3 ++- virt/kvm/irqchip.c | 7 ++++--- 5 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 6249cdc834d1..ed38f8114118 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return 0; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ca19627779db..24524c0f3ef8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2246,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int ret; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 47ad681a33fd..889563d50c55 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -248,7 +248,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 66b2f6159aad..60d339faa94c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1011,7 +1011,8 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 8db197bb6c7a..df99e9c3b64d 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm) free_irq_routing_table(rt); } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, +static int setup_routing_entry(struct kvm *kvm, + struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, e->gsi = ue->gsi; e->type = ue->type; - r = kvm_set_routing_entry(e, ue); + r = kvm_set_routing_entry(kvm, e, ue); if (r) goto out; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) @@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm, kfree(e); goto out; } - r = setup_routing_entry(new, e, ue); + r = setup_routing_entry(kvm, new, e, ue); if (r) { kfree(e); goto out; -- cgit v1.2.3 From 8a39d00670f0792c1186e442e1dd28fe0326f2ee Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:26 +0100 Subject: KVM: kvm_io_bus: Add kvm_io_bus_get_dev() call The kvm_io_bus framework is a nice place of holding information about various MMIO regions for kernel emulated devices. Add a call to retrieve the kvm_io_device structure which is associated with a certain MMIO address. This avoids to duplicate kvm_io_bus' knowledge of MMIO regions without having to fake MMIO calls if a user needs the device a certain MMIO address belongs to. This will be used by the ITS emulation to get the associated ITS device when someone triggers an MSI via an ioctl from userspace. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Acked-by: Paolo Bonzini Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0640ee92b978..614a98137c5f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ef54b4c31792..bd2eb92c5d0e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3496,6 +3496,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, return r; } +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr) +{ + struct kvm_io_bus *bus; + int dev_idx, srcu_idx; + struct kvm_io_device *iodev = NULL; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + + dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); + if (dev_idx < 0) + goto out_unlock; + + iodev = bus->range[dev_idx].dev; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return iodev; +} +EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); + static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, }; -- cgit v1.2.3 From 645b9e49a8c053182aae0765d797f557f7a67eda Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:28 +0100 Subject: irqchip/gic-v3: Refactor and add GICv3 definitions arm-gic-v3.h contains bit and register definitions for the GICv3 and ITS, at least for the bits the we currently care about. The ITS emulation needs more definitions, so add them and refactor the memory attribute #defines to be more universally usable. To avoid changing all users, we still provide some of the old definitons defined with the help of the new macros. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 180 ++++++++++++++++++++++++------------- 1 file changed, 120 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index bfbd707de390..9442be7f2461 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -112,34 +112,60 @@ #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) -#define GICR_PROPBASER_NonShareable (0U << 10) -#define GICR_PROPBASER_InnerShareable (1U << 10) -#define GICR_PROPBASER_OuterShareable (2U << 10) -#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PROPBASER_nCnB (0U << 7) -#define GICR_PROPBASER_nC (1U << 7) -#define GICR_PROPBASER_RaWt (2U << 7) -#define GICR_PROPBASER_RaWb (3U << 7) -#define GICR_PROPBASER_WaWt (4U << 7) -#define GICR_PROPBASER_WaWb (5U << 7) -#define GICR_PROPBASER_RaWaWt (6U << 7) -#define GICR_PROPBASER_RaWaWb (7U << 7) -#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7) -#define GICR_PROPBASER_IDBITS_MASK (0x1f) - -#define GICR_PENDBASER_NonShareable (0U << 10) -#define GICR_PENDBASER_InnerShareable (1U << 10) -#define GICR_PENDBASER_OuterShareable (2U << 10) -#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PENDBASER_nCnB (0U << 7) -#define GICR_PENDBASER_nC (1U << 7) -#define GICR_PENDBASER_RaWt (2U << 7) -#define GICR_PENDBASER_RaWb (3U << 7) -#define GICR_PENDBASER_WaWt (4U << 7) -#define GICR_PENDBASER_WaWb (5U << 7) -#define GICR_PENDBASER_RaWaWt (6U << 7) -#define GICR_PENDBASER_RaWaWb (7U << 7) -#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_IDBITS_MASK (0x1f) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_PTZ BIT_ULL(62) /* * Re-Distributor registers, offsets from SGI_base @@ -175,59 +201,74 @@ #define GITS_CWRITER 0x0088 #define GITS_CREADR 0x0090 #define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 #define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc #define GITS_TRANSLATER 0x10040 #define GITS_CTLR_ENABLE (1U << 0) #define GITS_CTLR_QUIESCENT (1U << 31) +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) - -#define GITS_CBASER_VALID (1UL << 63) -#define GITS_CBASER_nCnB (0UL << 59) -#define GITS_CBASER_nC (1UL << 59) -#define GITS_CBASER_RaWt (2UL << 59) -#define GITS_CBASER_RaWb (3UL << 59) -#define GITS_CBASER_WaWt (4UL << 59) -#define GITS_CBASER_WaWb (5UL << 59) -#define GITS_CBASER_RaWaWt (6UL << 59) -#define GITS_CBASER_RaWaWb (7UL << 59) -#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_CBASER_NonShareable (0UL << 10) -#define GITS_CBASER_InnerShareable (1UL << 10) -#define GITS_CBASER_OuterShareable (2UL << 10) -#define GITS_CBASER_SHAREABILITY_MASK (3UL << 10) +#define GITS_TYPER_HWCOLLCNT_SHIFT 24 + +#define GITS_CBASER_VALID (1UL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK + +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) #define GITS_BASER_NR_REGS 8 -#define GITS_BASER_VALID (1UL << 63) -#define GITS_BASER_nCnB (0UL << 59) -#define GITS_BASER_nC (1UL << 59) -#define GITS_BASER_RaWt (2UL << 59) -#define GITS_BASER_RaWb (3UL << 59) -#define GITS_BASER_WaWt (4UL << 59) -#define GITS_BASER_WaWb (5UL << 59) -#define GITS_BASER_RaWaWt (6UL << 59) -#define GITS_BASER_RaWaWb (7UL << 59) -#define GITS_BASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_VALID (1UL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) -#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1) -#define GITS_BASER_NonShareable (0UL << 10) -#define GITS_BASER_InnerShareable (1UL << 10) -#define GITS_BASER_OuterShareable (2UL << 10) #define GITS_BASER_SHAREABILITY_SHIFT (10) -#define GITS_BASER_SHAREABILITY_MASK (3UL << GITS_BASER_SHAREABILITY_SHIFT) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) #define GITS_BASER_PAGE_SIZE_SHIFT (8) #define GITS_BASER_PAGE_SIZE_4K (0UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_16K (1UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_64K (2UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_MASK (3UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) #define GITS_BASER_TYPE_NONE 0 #define GITS_BASER_TYPE_DEVICE 1 @@ -243,7 +284,10 @@ */ #define GITS_CMD_MAPD 0x08 #define GITS_CMD_MAPC 0x09 -#define GITS_CMD_MAPVI 0x0a +#define GITS_CMD_MAPTI 0x0a +/* older GIC documentation used MAPVI for this command */ +#define GITS_CMD_MAPVI GITS_CMD_MAPTI +#define GITS_CMD_MAPI 0x0b #define GITS_CMD_MOVI 0x01 #define GITS_CMD_DISCARD 0x0f #define GITS_CMD_INV 0x0c @@ -253,6 +297,22 @@ #define GITS_CMD_CLEAR 0x04 #define GITS_CMD_SYNC 0x05 +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + /* * CPU interface registers */ -- cgit v1.2.3 From 8c828a535e29f50282f1a49a52c3b20ccaa039aa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 18 Jul 2016 15:28:52 +0100 Subject: irqchip/gicv3-its: Restore all cacheability attributes Let's restore some of the #defines that have been savagely dropped by the introduction of the KVM ITS code, as pointlessly break other users (including series that are already in -next). Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 48 +++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 9442be7f2461..700b4216c87a 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -146,8 +146,16 @@ #define GICR_PROPBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) -#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) -#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + #define GICR_PROPBASER_IDBITS_MASK (0x1f) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) @@ -163,8 +171,16 @@ #define GICR_PENDBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) -#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) -#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + #define GICR_PENDBASER_PTZ BIT_ULL(62) /* @@ -237,24 +253,40 @@ #define GITS_CBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) -#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) -#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) + +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) #define GITS_BASER_NR_REGS 8 #define GITS_BASER_VALID (1UL << 63) #define GITS_BASER_INDIRECT (1ULL << 62) + #define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) #define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) #define GITS_BASER_INNER_CACHEABILITY_MASK \ GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK #define GITS_BASER_OUTER_CACHEABILITY_MASK \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) #define GITS_BASER_SHAREABILITY_MASK \ GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) -#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) -#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + #define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) -- cgit v1.2.3 From aff171641d181ea573380efc3f559c9de4741fc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:17 +0900 Subject: PCI: Provide sensible IRQ vector alloc/free routines Add a function to allocate and free a range of interrupt vectors, using MSI-X, MSI or legacy vectors (in that order) based on the capabilities of the underlying device and PCIe complex. Additionally a new helper is provided to get the Linux IRQ number for given device-relative vector so that the drivers don't need to allocate their own arrays to keep track of the vectors for the multi vector MSI-X case. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 467 +++++++--------------------------------- drivers/pci/msi.c | 89 ++++++++ include/linux/pci.h | 27 +++ 3 files changed, 192 insertions(+), 391 deletions(-) (limited to 'include/linux') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 1179850f453c..0ac612b8c3fb 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -78,422 +78,107 @@ CONFIG_PCI_MSI option. 4.2 Using MSI -Most of the hard work is done for the driver in the PCI layer. It simply -has to request that the PCI layer set up the MSI capability for this +Most of the hard work is done for the driver in the PCI layer. The driver +simply has to request that the PCI layer set up the MSI capability for this device. -4.2.1 pci_enable_msi +To automatically use MSI or MSI-X interrupt vectors, use the following +function: -int pci_enable_msi(struct pci_dev *dev) + int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); -A successful call allocates ONE interrupt to the device, regardless -of how many MSIs the device supports. The device is switched from -pin-based interrupt mode to MSI mode. The dev->irq number is changed -to a new number which represents the message signaled interrupt; -consequently, this function should be called before the driver calls -request_irq(), because an MSI is delivered via a vector that is -different from the vector of a pin-based interrupt. +which allocates up to max_vecs interrupt vectors for a PCI device. It +returns the number of vectors allocated or a negative error. If the device +has a requirements for a minimum number of vectors the driver can pass a +min_vecs argument set to this limit, and the PCI core will return -ENOSPC +if it can't meet the minimum number of vectors. -4.2.2 pci_enable_msi_range +The flags argument should normally be set to 0, but can be used to pass the +PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support +MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in +case the device does not support legacy interrupt lines. -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +To get the Linux IRQ numbers passed to request_irq() and free_irq() and the +vectors, use the following function: -This function allows a device driver to request any number of MSI -interrupts within specified range from 'minvec' to 'maxvec'. + int pci_irq_vector(struct pci_dev *dev, unsigned int nr); -If this function returns a positive number it indicates the number of -MSI interrupts that have been successfully allocated. In this case -the device is switched from pin-based interrupt mode to MSI mode and -updates dev->irq to be the lowest of the new interrupts assigned to it. -The other interrupts assigned to the device are in the range dev->irq -to dev->irq + returned value - 1. Device driver can use the returned -number of successfully allocated MSI interrupts to further allocate -and initialize device resources. +Any allocated resources should be freed before removing the device using +the following function: -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. + void pci_free_irq_vectors(struct pci_dev *dev); -This function should be called before the driver calls request_irq(), -because MSI interrupts are delivered via vectors that are different -from the vector of a pin-based interrupt. +If a device supports both MSI-X and MSI capabilities, this API will use the +MSI-X facilities in preference to the MSI facilities. MSI-X supports any +number of interrupts between 1 and 2048. In contrast, MSI is restricted to +a maximum of 32 interrupts (and must be a power of two). In addition, the +MSI interrupt vectors must be allocated consecutively, so the system might +not be able to allocate as many vectors for MSI as it could for MSI-X. On +some platforms, MSI interrupts must all be targeted at the same set of CPUs +whereas MSI-X interrupts can all be targeted at different CPUs. -It is ideal if drivers can cope with a variable number of MSI interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. +If a device supports neither MSI-X or MSI it will fall back to a single +legacy IRQ vector. -There could be devices that can not operate with just any number of MSI -interrupts within a range. See chapter 4.3.1.3 to get the idea how to -handle such devices for MSI-X - the same logic applies to MSI. +The typical usage of MSI or MSI-X interrupts is to allocate as many vectors +as possible, likely up to the limit supported by the device. If nvec is +larger than the number supported by the device it will automatically be +capped to the supported limit, so there is no need to query the number of +vectors supported beforehand: -4.2.1.1 Maximum possible number of MSI interrupts - -The typical usage of MSI interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msi_vec_count() function: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.2.1.2 Exact number of MSI interrupts + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0); + if (nvec < 0) + goto out_err; If a driver is unable or unwilling to deal with a variable number of MSI -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) -{ - return pci_enable_msi_range(pdev, nvec, nvec); -} - -Note, unlike pci_enable_msi_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msi_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msi_exact() does). - -4.2.1.3 Single MSI mode - -The most notorious example of the request type described above is -enabling the single MSI mode for a device. It could be done by passing -two 1s as 'minvec' and 'maxvec': - -static int foo_driver_enable_single_msi(struct pci_dev *pdev) -{ - return pci_enable_msi_range(pdev, 1, 1); -} - -Note, unlike pci_enable_msi() function, which could be also used to -enable the single MSI mode, pci_enable_msi_range() returns either a -negative errno or 1 (not negative errno or 0 - as pci_enable_msi() -does). - -4.2.3 pci_enable_msi_exact - -int pci_enable_msi_exact(struct pci_dev *dev, int nvec) - -This variation on pci_enable_msi_range() call allows a device driver to -request exactly 'nvec' MSIs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to request any more MSI interrupts for -this device. - -By contrast with pci_enable_msi_range() function, pci_enable_msi_exact() -returns zero in case of success, which indicates MSI interrupts have been -successfully allocated. - -4.2.4 pci_disable_msi - -void pci_disable_msi(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msi_range(). -Calling it restores dev->irq to the pin-based interrupt number and frees -the previously allocated MSIs. The interrupts may subsequently be assigned -to another device, so drivers should not cache the value of dev->irq. - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI enabled and thus leaking its vector. - -4.2.4 pci_msi_vec_count - -int pci_msi_vec_count(struct pci_dev *dev) - -This function could be used to retrieve the number of MSI vectors the -device requested (via the Multiple Message Capable register). The MSI -specification only allows the returned value to be a power of two, -up to a maximum of 2^5 (32). - -If this function returns a negative number, it indicates the device is -not capable of sending MSIs. - -If this function returns a positive number, it indicates the maximum -number of MSI interrupt vectors that could be allocated. - -4.3 Using MSI-X - -The MSI-X capability is much more flexible than the MSI capability. -It supports up to 2048 interrupts, each of which can be controlled -independently. To support this flexibility, drivers must use an array of -`struct msix_entry': - -struct msix_entry { - u16 vector; /* kernel uses to write alloc vector */ - u16 entry; /* driver uses to specify entry */ -}; - -This allows for the device to use these interrupts in a sparse fashion; -for example, it could use interrupts 3 and 1027 and yet allocate only a -two-element array. The driver is expected to fill in the 'entry' value -in each element of the array to indicate for which entries the kernel -should assign interrupts; it is invalid to fill in two entries with the -same number. - -4.3.1 pci_enable_msix_range - -int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) - -Calling this function asks the PCI subsystem to allocate any number of -MSI-X interrupts within specified range from 'minvec' to 'maxvec'. -The 'entries' argument is a pointer to an array of msix_entry structs -which should be at least 'maxvec' entries in size. - -On success, the device is switched into MSI-X mode and the function -returns the number of MSI-X interrupts that have been successfully -allocated. In this case the 'vector' member in entries numbered from -0 to the returned value - 1 is populated with the interrupt number; -the driver should then call request_irq() for each 'vector' that it -decides to use. The device driver is responsible for keeping track of the -interrupts assigned to the MSI-X vectors so it can free them again later. -Device driver can use the returned number of successfully allocated MSI-X -interrupts to further allocate and initialize device resources. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -This function, in contrast with pci_enable_msi_range(), does not adjust -dev->irq. The device will not generate interrupts for this interrupt -number once MSI-X is enabled. - -Device drivers should normally call this function once per device -during the initialization phase. - -It is ideal if drivers can cope with a variable number of MSI-X interrupts; -there are many reasons why the platform may not be able to provide the -exact number that a driver asks for. - -There could be devices that can not operate with just any number of MSI-X -interrupts within a range. E.g., an network adapter might need let's say -four vectors per each queue it provides. Therefore, a number of MSI-X -interrupts allocated should be a multiple of four. In this case interface -pci_enable_msix_range() can not be used alone to request MSI-X interrupts -(since it can allocate any number within the range, without any notion of -the multiple of four) and the device driver should master a custom logic -to request the required number of MSI-X interrupts. - -4.3.1.1 Maximum possible number of MSI-X interrupts - -The typical usage of MSI-X interrupts is to allocate as many vectors as -possible, likely up to the limit returned by pci_msix_vec_count() function: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - 1, nvec); -} - -Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive, -the value of 0 would be meaningless and could result in error. - -Some devices have a minimal limit on number of MSI-X interrupts. -In this case the function could look like this: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - FOO_DRIVER_MINIMUM_NVEC, nvec); -} - -4.3.1.2 Exact number of MSI-X interrupts - -If a driver is unable or unwilling to deal with a variable number of MSI-X -interrupts it could request a particular number of interrupts by passing -that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec' -parameters: - -static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) -{ - return pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - nvec, nvec); -} - -Note, unlike pci_enable_msix_exact() function, which could be also used to -enable a particular number of MSI-X interrupts, pci_enable_msix_range() -returns either a negative errno or 'nvec' (not negative errno or 0 - as -pci_enable_msix_exact() does). - -4.3.1.3 Specific requirements to the number of MSI-X interrupts - -As noted above, there could be devices that can not operate with just any -number of MSI-X interrupts within a range. E.g., let's assume a device that -is only capable sending the number of MSI-X interrupts which is a power of -two. A routine that enables MSI-X mode for such device might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries, - maxvec, maxvec); - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } - - return rc; -} - -Note how pci_enable_msix_range() return value is analyzed for a fallback - -any error code other than -ENOSPC indicates a fatal error and should not -be retried. - -4.3.2 pci_enable_msix_exact - -int pci_enable_msix_exact(struct pci_dev *dev, - struct msix_entry *entries, int nvec) - -This variation on pci_enable_msix_range() call allows a device driver to -request exactly 'nvec' MSI-Xs. - -If this function returns a negative number, it indicates an error and -the driver should not attempt to allocate any more MSI-X interrupts for -this device. - -By contrast with pci_enable_msix_range() function, pci_enable_msix_exact() -returns zero in case of success, which indicates MSI-X interrupts have been -successfully allocated. - -Another version of a routine that enables MSI-X mode for a device with -specific requirements described in chapter 4.3.1.3 might look like this: - -/* - * Assume 'minvec' and 'maxvec' are non-zero - */ -static int foo_driver_enable_msix(struct foo_adapter *adapter, - int minvec, int maxvec) -{ - int rc; - - minvec = roundup_pow_of_two(minvec); - maxvec = rounddown_pow_of_two(maxvec); - - if (minvec > maxvec) - return -ERANGE; - -retry: - rc = pci_enable_msix_exact(adapter->pdev, - adapter->msix_entries, maxvec); - - /* - * -ENOSPC is the only error code allowed to be analyzed - */ - if (rc == -ENOSPC) { - if (maxvec == 1) - return -ENOSPC; - - maxvec /= 2; - - if (minvec > maxvec) - return -ENOSPC; - - goto retry; - } else if (rc < 0) { - return rc; - } - - return maxvec; -} - -4.3.3 pci_disable_msix - -void pci_disable_msix(struct pci_dev *dev) - -This function should be used to undo the effect of pci_enable_msix_range(). -It frees the previously allocated MSI-X interrupts. The interrupts may -subsequently be assigned to another device, so drivers should not cache -the value of the 'vector' elements over a call to pci_disable_msix(). - -Before calling this function, a device driver must always call free_irq() -on any interrupt for which it previously called request_irq(). -Failure to do so results in a BUG_ON(), leaving the device with -MSI-X enabled and thus leaking its vector. - -4.3.3 The MSI-X Table - -The MSI-X capability specifies a BAR and offset within that BAR for the -MSI-X Table. This address is mapped by the PCI subsystem, and should not -be accessed directly by the device driver. If the driver wishes to -mask or unmask an interrupt, it should call disable_irq() / enable_irq(). +interrupts it can request a particular number of interrupts by passing that +number to pci_alloc_irq_vectors() function as both 'min_vecs' and +'max_vecs' parameters: -4.3.4 pci_msix_vec_count + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0); + if (ret < 0) + goto out_err; -int pci_msix_vec_count(struct pci_dev *dev) +The most notorious example of the request type described above is enabling +the single MSI mode for a device. It could be done by passing two 1s as +'min_vecs' and 'max_vecs': -This function could be used to retrieve number of entries in the device -MSI-X table. + ret = pci_alloc_irq_vectors(pdev, 1, 1, 0); + if (ret < 0) + goto out_err; -If this function returns a negative number, it indicates the device is -not capable of sending MSI-Xs. +Some devices might not support using legacy line interrupts, in which case +the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform +can't provide MSI or MSI-X interrupts: -If this function returns a positive number, it indicates the maximum -number of MSI-X interrupt vectors that could be allocated. + nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY); + if (nvec < 0) + goto out_err; -4.4 Handling devices implementing both MSI and MSI-X capabilities +4.3 Legacy APIs -If a device implements both MSI and MSI-X capabilities, it can -run in either MSI mode or MSI-X mode, but not both simultaneously. -This is a requirement of the PCI spec, and it is enforced by the -PCI layer. Calling pci_enable_msi_range() when MSI-X is already -enabled or pci_enable_msix_range() when MSI is already enabled -results in an error. If a device driver wishes to switch between MSI -and MSI-X at runtime, it must first quiesce the device, then switch -it back to pin-interrupt mode, before calling pci_enable_msi_range() -or pci_enable_msix_range() and resuming operation. This is not expected -to be a common operation but may be useful for debugging or testing -during development. +The following old APIs to enable and disable MSI or MSI-X interrupts should +not be used in new code: -4.5 Considerations when using MSIs + pci_enable_msi() /* deprecated */ + pci_enable_msi_range() /* deprecated */ + pci_enable_msi_exact() /* deprecated */ + pci_disable_msi() /* deprecated */ + pci_enable_msix_range() /* deprecated */ + pci_enable_msix_exact() /* deprecated */ + pci_disable_msix() /* deprecated */ -4.5.1 Choosing between MSI-X and MSI +Additionally there are APIs to provide the number of supported MSI or MSI-X +vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these +should be avoided in favor of letting pci_alloc_irq_vectors() cap the +number of vectors. If you have a legitimate special use case for the count +of vectors we might have to revisit that decision and add a +pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently. -If your device supports both MSI-X and MSI capabilities, you should use -the MSI-X facilities in preference to the MSI facilities. As mentioned -above, MSI-X supports any number of interrupts between 1 and 2048. -In contrast, MSI is restricted to a maximum of 32 interrupts (and -must be a power of two). In addition, the MSI interrupt vectors must -be allocated consecutively, so the system might not be able to allocate -as many vectors for MSI as it could for MSI-X. On some platforms, MSI -interrupts must all be targeted at the same set of CPUs whereas MSI-X -interrupts can all be targeted at different CPUs. +4.4 Considerations when using MSIs -4.5.2 Spinlocks +4.4.1 Spinlocks Most device drivers have a per-device spinlock which is taken in the interrupt handler. With pin-based interrupts or a single MSI, it is not @@ -505,7 +190,7 @@ acquire the spinlock. Such deadlocks can be avoided by using spin_lock_irqsave() or spin_lock_irq() which disable local interrupts and acquire the lock (see Documentation/DocBook/kernel-locking). -4.6 How to tell whether MSI/MSI-X is enabled on a device +4.5 How to tell whether MSI/MSI-X is enabled on a device Using 'lspci -v' (as root) may show some devices with "MSI", "Message Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 98ace67c5f4d..5e5ab478ea7d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -4,6 +4,7 @@ * * Copyright (C) 2003-2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) + * Copyright (C) 2016 Christoph Hellwig. */ #include @@ -1121,6 +1122,94 @@ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, } EXPORT_SYMBOL(pci_enable_msix_range); +/** + * pci_alloc_irq_vectors - allocate multiple IRQs for a device + * @dev: PCI device to operate on + * @min_vecs: minimum number of vectors required (must be >= 1) + * @max_vecs: maximum (desired) number of vectors + * @flags: flags or quirks for the allocation + * + * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI + * vectors if available, and fall back to a single legacy vector + * if neither is available. Return the number of vectors allocated, + * (which might be smaller than @max_vecs) if successful, or a negative + * error code on error. If less than @min_vecs interrupt vectors are + * available for @dev the function will fail with -ENOSPC. + * + * To get the Linux IRQ number used for a vector that can be passed to + * request_irq() use the pci_irq_vector() helper. + */ +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + int vecs = -ENOSPC; + + if (!(flags & PCI_IRQ_NOMSIX)) { + vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + if (!(flags & PCI_IRQ_NOMSI)) { + vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + if (vecs > 0) + return vecs; + } + + /* use legacy irq if allowed */ + if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1) + return 1; + return vecs; +} +EXPORT_SYMBOL(pci_alloc_irq_vectors); + +/** + * pci_free_irq_vectors - free previously allocated IRQs for a device + * @dev: PCI device to operate on + * + * Undoes the allocations and enabling in pci_alloc_irq_vectors(). + */ +void pci_free_irq_vectors(struct pci_dev *dev) +{ + pci_disable_msix(dev); + pci_disable_msi(dev); +} +EXPORT_SYMBOL(pci_free_irq_vectors); + +/** + * pci_irq_vector - return Linux IRQ number of a device vector + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + */ +int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (dev->msix_enabled) { + struct msi_desc *entry; + int i = 0; + + for_each_pci_msi_entry(entry, dev) { + if (i == nr) + return entry->irq; + i++; + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + if (dev->msi_enabled) { + struct msi_desc *entry = first_pci_msi_entry(dev); + + if (WARN_ON_ONCE(nr >= entry->nvec_used)) + return -EINVAL; + } else { + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + } + + return dev->irq + nr; +} +EXPORT_SYMBOL(pci_irq_vector); + struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc) { return to_pci_dev(desc->dev); diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..52ecd49e8049 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1237,6 +1237,10 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno); int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); +#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ +#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ +#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ + /* kmem_cache style wrapper around pci_alloc_consistent() */ #include @@ -1284,6 +1288,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev, return rc; return 0; } +int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags); +void pci_free_irq_vectors(struct pci_dev *dev); +int pci_irq_vector(struct pci_dev *dev, unsigned int nr); + #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_msi_shutdown(struct pci_dev *dev) { } @@ -1307,6 +1316,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev, static inline int pci_enable_msix_exact(struct pci_dev *dev, struct msix_entry *entries, int nvec) { return -ENOSYS; } +static inline int pci_alloc_irq_vectors(struct pci_dev *dev, + unsigned int min_vecs, unsigned int max_vecs, + unsigned int flags) +{ + if (min_vecs > 1) + return -EINVAL; + return 1; +} +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + if (WARN_ON_ONCE(nr > 0)) + return -EINVAL; + return dev->irq; +} #endif #ifdef CONFIG_PCIEPORTBUS -- cgit v1.2.3 From 4ef33685aa0957d771e068b60a5f3ca6b47ade1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jul 2016 18:20:18 +0900 Subject: PCI: Spread interrupt vectors in pci_alloc_irq_vectors() Set the affinity_mask in the PCI device before allocating vectors so that the affinity can be propagated through the MSI descriptor structures to the core IRQ code. To facilitate this, new __pci_enable_msi_range() and __pci_enable_msix_range() helpers are factored out of their not prefixed variants which assigning the new IRQ affinity mask in the PCI device so that the low-level interrupt code can perform the interrupt affinity assignment and do node-local allocations. A new PCI_IRQ_NOAFFINITY flag is added to pci_alloc_irq_vectors() so that this function can also be used by drivers that don't wish to use the automatic affinity assignment. [bhelgaas: omit "else" after "return" consistently] Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Alexander Gordeev --- Documentation/PCI/MSI-HOWTO.txt | 4 ++ drivers/pci/msi.c | 134 ++++++++++++++++++++++++++-------------- include/linux/pci.h | 2 + 3 files changed, 95 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 0ac612b8c3fb..c55df2911136 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -99,6 +99,10 @@ PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in case the device does not support legacy interrupt lines. +By default this function will spread the interrupts around the available +CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY +flag. + To get the Linux IRQ numbers passed to request_irq() and free_irq() and the vectors, use the following function: diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5e5ab478ea7d..a02981efdad5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -569,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec) entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); entry->nvec_used = nvec; + entry->affinity = dev->irq_affinity; if (control & PCI_MSI_FLAGS_64BIT) entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; @@ -680,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries) static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec) { + const struct cpumask *mask = NULL; struct msi_desc *entry; - int i; + int cpu = -1, i; for (i = 0; i < nvec; i++) { + if (dev->irq_affinity) { + cpu = cpumask_next(cpu, dev->irq_affinity); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(dev->irq_affinity); + mask = cpumask_of(cpu); + } + entry = alloc_msi_entry(&dev->dev); if (!entry) { if (!i) @@ -703,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; entry->nvec_used = 1; + entry->affinity = mask; list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); } @@ -1028,19 +1038,8 @@ int pci_msi_enabled(void) } EXPORT_SYMBOL(pci_msi_enabled); -/** - * pci_enable_msi_range - configure device's MSI capability structure - * @dev: device to configure - * @minvec: minimal number of interrupts to configure - * @maxvec: maximum number of interrupts to configure - * - * This function tries to allocate a maximum possible number of interrupts in a - * range between @minvec and @maxvec. It returns a negative errno if an error - * occurs. If it succeeds, it returns the actual number of interrupts allocated - * and updates the @dev's irq member to the lowest new interrupt number; - * the other interrupt numbers allocated to this device are consecutive. - **/ -int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, + unsigned int flags) { int nvec; int rc; @@ -1063,25 +1062,85 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) nvec = pci_msi_vec_count(dev); if (nvec < 0) return nvec; - else if (nvec < minvec) + if (nvec < minvec) return -EINVAL; - else if (nvec > maxvec) + + if (nvec > maxvec) nvec = maxvec; - do { + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) + return -ENOSPC; + } + rc = msi_capability_init(dev, nvec); - if (rc < 0) { + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) return rc; - } else if (rc > 0) { - if (rc < minvec) + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } +} + +/** + * pci_enable_msi_range - configure device's MSI capability structure + * @dev: device to configure + * @minvec: minimal number of interrupts to configure + * @maxvec: maximum number of interrupts to configure + * + * This function tries to allocate a maximum possible number of interrupts in a + * range between @minvec and @maxvec. It returns a negative errno if an error + * occurs. If it succeeds, it returns the actual number of interrupts allocated + * and updates the @dev's irq member to the lowest new interrupt number; + * the other interrupt numbers allocated to this device are consecutive. + **/ +int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) +{ + return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY); +} +EXPORT_SYMBOL(pci_enable_msi_range); + +static int __pci_enable_msix_range(struct pci_dev *dev, + struct msix_entry *entries, int minvec, int maxvec, + unsigned int flags) +{ + int nvec = maxvec; + int rc; + + if (maxvec < minvec) + return -ERANGE; + + for (;;) { + if (!(flags & PCI_IRQ_NOAFFINITY)) { + dev->irq_affinity = irq_create_affinity_mask(&nvec); + if (nvec < minvec) return -ENOSPC; - nvec = rc; } - } while (rc); - return nvec; + rc = pci_enable_msix(dev, entries, nvec); + if (rc == 0) + return nvec; + + kfree(dev->irq_affinity); + dev->irq_affinity = NULL; + + if (rc < 0) + return rc; + if (rc < minvec) + return -ENOSPC; + + nvec = rc; + } } -EXPORT_SYMBOL(pci_enable_msi_range); /** * pci_enable_msix_range - configure device's MSI-X capability structure @@ -1099,26 +1158,10 @@ EXPORT_SYMBOL(pci_enable_msi_range); * with new allocated MSI-X interrupts. **/ int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, - int minvec, int maxvec) + int minvec, int maxvec) { - int nvec = maxvec; - int rc; - - if (maxvec < minvec) - return -ERANGE; - - do { - rc = pci_enable_msix(dev, entries, nvec); - if (rc < 0) { - return rc; - } else if (rc > 0) { - if (rc < minvec) - return -ENOSPC; - nvec = rc; - } - } while (rc); - - return nvec; + return __pci_enable_msix_range(dev, entries, minvec, maxvec, + PCI_IRQ_NOAFFINITY); } EXPORT_SYMBOL(pci_enable_msix_range); @@ -1145,13 +1188,14 @@ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, int vecs = -ENOSPC; if (!(flags & PCI_IRQ_NOMSIX)) { - vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs); + vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs, + flags); if (vecs > 0) return vecs; } if (!(flags & PCI_IRQ_NOMSI)) { - vecs = pci_enable_msi_range(dev, min_vecs, max_vecs); + vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags); if (vecs > 0) return vecs; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 52ecd49e8049..f1406619f868 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -320,6 +320,7 @@ struct pci_dev { * directly, use the values stored here. They might be different! */ unsigned int irq; + struct cpumask *irq_affinity; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ bool match_driver; /* Skip attaching driver */ @@ -1240,6 +1241,7 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */ #define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */ #define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */ +#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */ /* kmem_cache style wrapper around pci_alloc_consistent() */ -- cgit v1.2.3 From e16b46605960bd071a3e26f316e0bb600ae91e37 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 21 Jul 2016 21:40:28 -0600 Subject: PCI: Allow additional bus numbers for hotplug bridges A user may hot add a switch requiring more than one bus to enumerate. This previously required a system reboot if BIOS did not sufficiently pad the bus resource, which they frequently don't do. Add a kernel parameter so a user can specify the minimum number of bus numbers to reserve for a hotplug bridge's subordinate buses so rebooting won't be necessary. The default is 1, which is equivalent to previous behavior. Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas --- Documentation/kernel-parameters.txt | 3 +++ drivers/pci/pci.c | 8 ++++++++ drivers/pci/probe.c | 9 +++++++++ include/linux/pci.h | 1 + 4 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 82b42c958d1c..487e799b1da2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3016,6 +3016,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. hpmemsize=nn[KMG] The fixed amount of bus space which is reserved for hotplug bridge's memory window. Default size is 2 megabytes. + hpbussize=nn The minimum amount of additional bus numbers + reserved for buses below a hotplug bridge. + Default is 1. realloc= Enable/disable reallocating PCI bridge resources if allocations done by BIOS are too small to accommodate resources required by all child diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c8b4dbdd1bdd..f18ea90cf91a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -81,6 +81,9 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; +#define DEFAULT_HOTPLUG_BUS_SIZE 1 +unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; + enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT; /* @@ -5021,6 +5024,11 @@ static int __init pci_setup(char *str) pci_hotplug_io_size = memparse(str + 9, &str); } else if (!strncmp(str, "hpmemsize=", 10)) { pci_hotplug_mem_size = memparse(str + 10, &str); + } else if (!strncmp(str, "hpbussize=", 10)) { + pci_hotplug_bus_size = + simple_strtoul(str + 10, &str, 0); + if (pci_hotplug_bus_size > 0xff) + pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE; } else if (!strncmp(str, "pcie_bus_tune_off", 17)) { pcie_bus_config = PCIE_BUS_TUNE_OFF; } else if (!strncmp(str, "pcie_bus_safe", 13)) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e3ef720997d..f680099c110d 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2076,6 +2076,15 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus) max = pci_scan_bridge(bus, dev, max, pass); } + /* + * Make sure a hotplug bridge has at least the minimum requested + * number of buses. + */ + if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) { + if (max - bus->busn_res.start < pci_hotplug_bus_size - 1) + max = bus->busn_res.start + pci_hotplug_bus_size - 1; + } + /* * We've scanned the bus and so we know all about what's on * the other side of any bridges that may be on this bus plus diff --git a/include/linux/pci.h b/include/linux/pci.h index b67e4df20801..0c283254111d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1706,6 +1706,7 @@ extern u8 pci_cache_line_size; extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); -- cgit v1.2.3 From b2aa5d0bc86cb901cc6c8737cfff66360cbff00c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 7 Jun 2016 21:57:15 +0200 Subject: libceph: fix some missing includes - decode.h needs slab.h for kmalloc() - osd_client.h needs msgpool.h for struct ceph_msgpool - msgpool.h doesn't need messenger.h Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 1 + include/linux/ceph/msgpool.h | 1 - include/linux/ceph/osd_client.h | 1 + net/ceph/msgpool.c | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 19e9932f3e77..e83f3c81ef43 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -3,6 +3,7 @@ #include #include +#include #include #include diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index 4b0d38960726..ddd0d48d0384 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h @@ -2,7 +2,6 @@ #define _FS_CEPH_MSGPOOL #include -#include /* * we use memory pools for preallocating messages we may receive, to diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1b3b6e155392..858932304260 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index ddec1c10ac80..aaed59a47b1d 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -5,6 +5,7 @@ #include #include +#include #include static void *msgpool_alloc(gfp_t gfp_mask, void *arg) -- cgit v1.2.3 From 281dbe5db81c6137def9757e07a7aea14b1ed86e Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 26 Jul 2016 15:22:35 +0200 Subject: libceph: add an ONSTACK initializer for oids An on-stack oid in ceph_ioctl_get_dataloc() is not initialized, resulting in a WARN and a NULL pointer dereference later on. We will have more of these on-stack in the future, so fix it with a convenience macro. Fixes: d30291b985d1 ("libceph: variable-sized ceph_object_id") Signed-off-by: Ilya Dryomov --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index be6b1657b1af..0946f2d4a81f 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; struct ceph_object_locator oloc; - struct ceph_object_id oid; + CEPH_DEFINE_OID_ONSTACK(oid); u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9ccf4dbe55f8..21d7f048959f 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -115,6 +115,11 @@ static inline void ceph_oid_init(struct ceph_object_id *oid) oid->name_len = 0; } +#define CEPH_OID_INIT_ONSTACK(oid) \ + ({ ceph_oid_init(&oid); oid; }) +#define CEPH_DEFINE_OID_ONSTACK(oid) \ + struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid) + static inline bool ceph_oid_empty(const struct ceph_object_id *oid) { return oid->name == oid->inline_name && !oid->name_len; -- cgit v1.2.3 From 22748f9d617b8cd0a915c3a4c656c7232645b3b5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 2 Jun 2016 14:23:32 +0200 Subject: libceph: add start en/decoding block helpers Add ceph_start_encoding() and ceph_start_decoding(), the equivalent of ENCODE_START and DECODE_START in the userspace ceph code. This is based on a patch from Mike Christie . Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index e83f3c81ef43..f990f2cc907a 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -218,6 +218,60 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } +/* + * version and length starting block encoders/decoders + */ + +/* current code version (u8) + compat code version (u8) + len of struct (u32) */ +#define CEPH_ENCODING_START_BLK_LEN 6 + +/** + * ceph_start_encoding - start encoding block + * @struct_v: current (code) version of the encoding + * @struct_compat: oldest code version that can decode it + * @struct_len: length of struct encoding + */ +static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat, + u32 struct_len) +{ + ceph_encode_8(p, struct_v); + ceph_encode_8(p, struct_compat); + ceph_encode_32(p, struct_len); +} + +/** + * ceph_start_decoding - start decoding block + * @v: current version of the encoding that the code supports + * @name: name of the struct (free-form) + * @struct_v: out param for the encoding version + * @struct_len: out param for the length of struct encoding + * + * Validates the length of struct encoding, so unsafe ceph_decode_* + * variants can be used for decoding. + */ +static inline int ceph_start_decoding(void **p, void *end, u8 v, + const char *name, u8 *struct_v, + u32 *struct_len) +{ + u8 struct_compat; + + ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad); + *struct_v = ceph_decode_8(p); + struct_compat = ceph_decode_8(p); + if (v < struct_compat) { + pr_warn("got struct_v %d struct_compat %d > %d of %s\n", + *struct_v, struct_compat, v, name); + return -EINVAL; + } + + *struct_len = ceph_decode_32(p); + ceph_decode_need(p, end, *struct_len, bad); + return 0; + +bad: + return -ERANGE; +} + #define ceph_encode_need(p, end, n, bad) \ do { \ if (!likely(ceph_has_room(p, end, n))) \ -- cgit v1.2.3 From 7627151ea30bce2051e3cb27d7bb2c30083f86a5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 3 Feb 2016 21:24:49 +0800 Subject: libceph: define new ceph_file_layout structure Define new ceph_file_layout structure and rename old ceph_file_layout to ceph_file_layout_legacy. This is preparation for adding namespace to ceph_file_layout structure. Signed-off-by: Yan, Zheng --- drivers/block/rbd.c | 14 ++++++------- fs/ceph/addr.c | 18 ++++++++-------- fs/ceph/caps.c | 5 ++++- fs/ceph/file.c | 6 +++--- fs/ceph/inode.c | 7 ++++--- fs/ceph/ioctl.c | 22 +++++++++---------- fs/ceph/mds_client.h | 2 +- fs/ceph/xattr.c | 28 ++++++++++--------------- include/linux/ceph/ceph_fs.h | 50 +++++++++++++++++++------------------------- net/ceph/ceph_fs.c | 30 +++++++++++++++++++++++--- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 6 +++--- 12 files changed, 103 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 81666a56415e..cc272ed46cfd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", obj_request->object_name)) goto fail; @@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", obj_request->object_name)) goto fail; @@ -3995,10 +3995,10 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, /* Initialize the layout used for all rbd requests */ - rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); - rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; + rbd_dev->layout.stripe_count = 1; + rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; + rbd_dev->layout.pool_id = spec->pool_id; /* * If this is a mapping rbd_dev (as opposed to a parent one), @@ -5187,7 +5187,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); - rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; if (rbd_dev->image_format == 1) ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", spec->image_name, RBD_SUFFIX); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26a9d10d75e9..3f8efd866fec 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1730,7 +1730,7 @@ enum { POOL_WRITE = 2, }; -static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool) { struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1757,7 +1757,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) if (*p) goto out; - dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); parent = NULL; @@ -1860,13 +1860,13 @@ out_unlock: out: if (!err) err = have; - dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) { - u32 pool; + s64 pool; int ret, flags; /* does not support pool namespace yet */ @@ -1879,17 +1879,17 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) spin_lock(&ci->i_ceph_lock); flags = ci->i_ceph_flags; - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; spin_unlock(&ci->i_ceph_lock); check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { - dout("ceph_pool_perm_check pool %u no read perm\n", + dout("ceph_pool_perm_check pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { - dout("ceph_pool_perm_check pool %u no write perm\n", + dout("ceph_pool_perm_check pool %lld no write perm\n", pool); return -EPERM; } @@ -1907,10 +1907,10 @@ check: flags |= CEPH_I_POOL_WR; spin_lock(&ci->i_ceph_lock); - if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { + if (pool == ci->i_layout.pool_id) { ci->i_ceph_flags = flags; } else { - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; flags = ci->i_ceph_flags; } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6f60d0a3d0f9..f24722dce167 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2895,8 +2895,11 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ - ci->i_layout = grant->layout; + s64 old_pool = ci->i_layout.pool_id; + ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); ci->i_pool_ns_len = pool_ns_len; + if (ci->i_layout.pool_id != old_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0daaf7ceedc5..cba5dcf49a65 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1583,9 +1583,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) { int ret = 0; struct ceph_inode_info *ci = ceph_inode(inode); - s32 stripe_unit = ceph_file_layout_su(ci->i_layout); - s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - s32 object_size = ceph_file_layout_object_size(ci->i_layout); + s32 stripe_unit = ci->i_layout.stripe_unit; + s32 stripe_count = ci->i_layout.stripe_count; + s32 object_size = ci->i_layout.object_size; u64 object_set_size = object_size * stripe_count; u64 nearly, t; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f059b5997072..6c5903e0a976 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -814,10 +814,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { - if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) - ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; - ci->i_layout = info->layout; + s64 old_pool = ci->i_layout.pool_id; + ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); ci->i_pool_ns_len = iinfo->pool_ns_len; + if (ci->i_layout.pool_id != old_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 0946f2d4a81f..843dd31a02cd 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (!err) { - l.stripe_unit = ceph_file_layout_su(ci->i_layout); - l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - l.object_size = ceph_file_layout_object_size(ci->i_layout); - l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.stripe_unit = ci->i_layout.stripe_unit; + l.stripe_count = ci->i_layout.stripe_count; + l.object_size = ci->i_layout.object_size; + l.data_pool = ci->i_layout.pool_id; l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; @@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) if (l.stripe_count) nl.stripe_count = l.stripe_count; else - nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.stripe_count = ci->i_layout.stripe_count; if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; else - nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_unit = ci->i_layout.stripe_unit; if (l.object_size) nl.object_size = l.object_size; else - nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.object_size = ci->i_layout.object_size; if (l.data_pool) nl.data_pool = l.data_pool; else - nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ nl.preferred_osd = le64_to_cpu(-1); @@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EIO; } dl.file_offset -= dl.object_offset; - dl.object_size = ceph_file_layout_object_size(ci->i_layout); - dl.block_size = ceph_file_layout_su(ci->i_layout); + dl.object_size = ci->i_layout.object_size; + dl.block_size = ci->i_layout.stripe_unit; /* block_offset = object_offset % block_size */ tmp = dl.object_offset; @@ -212,7 +212,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + oloc.pool = ci->i_layout.pool_id; ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e7d38aac7109..75ecf967d0b7 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -275,8 +275,8 @@ struct ceph_mds_request { struct ceph_pool_perm { struct rb_node node; - u32 pool; int perm; + s64 pool; }; /* diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 4870b29df224..5377c9c7a0c5 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -72,7 +72,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + s64 pool = ci->i_layout.pool_id; const char *pool_name; char buf[128]; @@ -82,10 +82,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, if (pool_name) { size_t len = strlen(pool_name); ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + "stripe_unit=%u stripe_count=%u object_size=%u pool=", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size); if (!size) { ret += len; } else if (ret + len > size) { @@ -97,11 +96,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } } else { ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - (unsigned long long)pool); + "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size, (unsigned long long)pool); if (size) { if (ret <= size) memcpy(val, buf, ret); @@ -116,22 +113,19 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_unit); } static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_count); } static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.object_size); } static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, @@ -140,7 +134,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + s64 pool = ci->i_layout.pool_id; const char *pool_name; down_read(&osdc->lock); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index dfce616002ad..e5a5fb9ca3f5 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -34,9 +34,9 @@ #define CEPH_MAX_MON 31 /* - * ceph_file_layout - describe data layout for a file/inode + * legacy ceph_file_layoute */ -struct ceph_file_layout { +struct ceph_file_layout_legacy { /* file -> object mapping */ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. */ @@ -53,33 +53,25 @@ struct ceph_file_layout { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) - -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} - -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); -} +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + u32 stripe_unit; /* stripe unit, in bytes */ + u32 stripe_count; /* over this many objects */ + u32 object_size; /* until objects are this big */ + s64 pool_id; /* rados pool id */ +}; + +extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); +extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy); #define CEPH_MIN_STRIPE_UNIT 65536 -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); - struct ceph_dir_layout { __u8 dl_dir_hash; /* see ceph_hash.h for ids */ __u8 dl_unused1; @@ -399,7 +391,7 @@ union ceph_mds_request_args { __le32 flags; } __attribute__ ((packed)) setxattr; struct { - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; } __attribute__ ((packed)) setlayout; struct { __u8 rule; /* currently fcntl or flock */ @@ -478,7 +470,7 @@ struct ceph_mds_reply_inode { __le64 version; /* inode version */ __le64 xattr_version; /* version for xattr blob */ struct ceph_mds_reply_cap cap; /* caps issued for this inode */ - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; struct ceph_timespec ctime, mtime, atime; __le32 time_warp_seq; __le64 size, max_size, truncate_size; @@ -673,7 +665,7 @@ struct ceph_mds_caps { __le64 size, max_size, truncate_size; __le32 truncate_seq; struct ceph_timespec mtime, atime, ctime; - struct ceph_file_layout layout; + struct ceph_file_layout_legacy layout; __le32 time_warp_seq; } __attribute__ ((packed)); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 41466ccb972a..7d54e944de5e 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -9,9 +9,9 @@ */ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) { - __u32 su = le32_to_cpu(layout->fl_stripe_unit); - __u32 sc = le32_to_cpu(layout->fl_stripe_count); - __u32 os = le32_to_cpu(layout->fl_object_size); + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; /* stripe unit, object size must be non-zero, 64k increment */ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) @@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) return 1; } +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0) + fl->pool_id = -1; +} +EXPORT_SYMBOL(ceph_file_layout_from_legacy); + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} +EXPORT_SYMBOL(ceph_file_layout_to_legacy); int ceph_flags_to_mode(int flags) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 89469592076c..23efcac80072 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -932,7 +932,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { - u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { @@ -948,7 +948,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_flags = flags; - req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = layout->pool_id; ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_snapid = vino.snap; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7e480bf75bcf..5947b2e9fb8e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1770,9 +1770,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *ono, u64 *oxoff, u64 *oxlen) { - u32 osize = le32_to_cpu(layout->fl_object_size); - u32 su = le32_to_cpu(layout->fl_stripe_unit); - u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 osize = layout->object_size; + u32 su = layout->stripe_unit; + u32 sc = layout->stripe_count; u32 bl, stripeno, stripepos, objsetno; u32 su_per_object; u64 t, su_offset; -- cgit v1.2.3 From 51e9273796a57c08801f45580d3db3c51987a0cb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 5 Feb 2016 15:36:22 +0800 Subject: libceph: introduce reference counted string The data structure is for storing namesapce string. It allows namespace string to be shared between cephfs inodes with same layout. This data structure can also be referenced by OSD request. Signed-off-by: Yan, Zheng --- include/linux/ceph/libceph.h | 1 + include/linux/ceph/string_table.h | 62 +++++++++++++++++++++ net/ceph/Makefile | 2 +- net/ceph/ceph_common.c | 2 + net/ceph/string_table.c | 111 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 include/linux/ceph/string_table.h create mode 100644 net/ceph/string_table.c (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 690985daad1c..6afc5d98fad1 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -21,6 +21,7 @@ #include #include #include +#include /* * mount options diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h new file mode 100644 index 000000000000..1b02c96daf75 --- /dev/null +++ b/include/linux/ceph/string_table.h @@ -0,0 +1,62 @@ +#ifndef _FS_CEPH_STRING_TABLE_H +#define _FS_CEPH_STRING_TABLE_H + +#include +#include +#include +#include + +struct ceph_string { + struct kref kref; + union { + struct rb_node node; + struct rcu_head rcu; + }; + size_t len; + char str[]; +}; + +extern void ceph_release_string(struct kref *ref); +extern struct ceph_string *ceph_find_or_create_string(const char *str, + size_t len); +extern bool ceph_strings_empty(void); + +static inline struct ceph_string *ceph_get_string(struct ceph_string *str) +{ + kref_get(&str->kref); + return str; +} + +static inline void ceph_put_string(struct ceph_string *str) +{ + if (!str) + return; + kref_put(&str->kref, ceph_release_string); +} + +static inline int ceph_compare_string(struct ceph_string *cs, + const char* str, size_t len) +{ + size_t cs_len = cs ? cs->len : 0; + if (cs_len != len) + return cs_len - len; + if (len == 0) + return 0; + return strncmp(cs->str, str, len); +} + +#define ceph_try_get_string(x) \ +({ \ + struct ceph_string *___str; \ + rcu_read_lock(); \ + for (;;) { \ + ___str = rcu_dereference(x); \ + if (!___str || \ + kref_get_unless_zero(&___str->kref)) \ + break; \ + } \ + rcu_read_unlock(); \ + (___str); \ +}) + +#endif diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 958d9856912c..84cbed630c4b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o + pagevec.o snapshot.o string_table.o diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 55d2bfee16d7..bddfcf6f09c2 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -747,6 +747,8 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + WARN_ON(!ceph_strings_empty()); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c new file mode 100644 index 000000000000..ca53c8319209 --- /dev/null +++ b/net/ceph/string_table.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(string_tree_lock); +static struct rb_root string_tree = RB_ROOT; + +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len) +{ + struct ceph_string *cs, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&string_tree_lock); + p = &string_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist && !kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + exist = NULL; + } + spin_unlock(&string_tree_lock); + if (exist) + return exist; + + cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS); + if (!cs) + return NULL; + + kref_init(&cs->kref); + cs->len = len; + memcpy(cs->str, str, len); + cs->str[len] = 0; + +retry: + exist = NULL; + parent = NULL; + p = &string_tree.rb_node; + spin_lock(&string_tree_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + ret = 0; + if (!exist) { + rb_link_node(&cs->node, parent, p); + rb_insert_color(&cs->node, &string_tree); + } else if (!kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + ret = -EAGAIN; + } + spin_unlock(&string_tree_lock); + if (ret == -EAGAIN) + goto retry; + + if (exist) { + kfree(cs); + cs = exist; + } + + return cs; +} +EXPORT_SYMBOL(ceph_find_or_create_string); + +static void ceph_free_string(struct rcu_head *head) +{ + struct ceph_string *cs = container_of(head, struct ceph_string, rcu); + kfree(cs); +} + +void ceph_release_string(struct kref *ref) +{ + struct ceph_string *cs = container_of(ref, struct ceph_string, kref); + + spin_lock(&string_tree_lock); + if (!RB_EMPTY_NODE(&cs->node)) { + rb_erase(&cs->node, &string_tree); + RB_CLEAR_NODE(&cs->node); + } + spin_unlock(&string_tree_lock); + + call_rcu(&cs->rcu, ceph_free_string); +} +EXPORT_SYMBOL(ceph_release_string); + +bool ceph_strings_empty(void) +{ + return RB_EMPTY_ROOT(&string_tree); +} -- cgit v1.2.3 From 30c156d9951e0aa88202707d80c583b0a09d3167 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 14 Feb 2016 11:24:31 +0800 Subject: libceph: rados pool namespace support Add pool namesapce pointer to struct ceph_file_layout and struct ceph_object_locator. Pool namespace is used by when mapping object to PG, it's also used when composing OSD request. The namespace pointer in struct ceph_file_layout is RCU protected. So libceph can read namespace without taking lock. Signed-off-by: Yan, Zheng [idryomov@gmail.com: ceph_oloc_destroy(), misc minor changes] Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 1 + fs/ceph/inode.c | 3 +++ include/linux/ceph/ceph_fs.h | 2 ++ include/linux/ceph/osdmap.h | 10 ++++----- net/ceph/debugfs.c | 12 ++++++++-- net/ceph/osd_client.c | 32 +++++++++++++++++++++------ net/ceph/osdmap.c | 52 +++++++++++++++++++++++++++++++++++++++----- 7 files changed, 92 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cc272ed46cfd..58fd02d4e534 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3999,6 +3999,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->layout.stripe_count = 1; rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; rbd_dev->layout.pool_id = spec->pool_id; + RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); /* * If this is a mapping rbd_dev (as opposed to a parent one), diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 6c5903e0a976..d035e0ab6029 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -446,6 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_pool_ns_len = 0; ci->i_fragtree = RB_ROOT; @@ -570,6 +571,8 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ceph_put_string(ci->i_layout.pool_ns); + call_rcu(&inode->i_rcu, ceph_i_callback); } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index e5a5fb9ca3f5..08b8fd72261e 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -53,6 +53,7 @@ struct ceph_file_layout_legacy { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); +struct ceph_string; /* * ceph_file_layout - describe data layout for a file/inode */ @@ -62,6 +63,7 @@ struct ceph_file_layout { u32 stripe_count; /* over this many objects */ u32 object_size; /* until objects are this big */ s64 pool_id; /* rados pool id */ + struct ceph_string __rcu *pool_ns; /* rados pool namespace */ }; extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 21d7f048959f..9a9041784dcf 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -63,11 +63,13 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) struct ceph_object_locator { s64 pool; + struct ceph_string *pool_ns; }; static inline void ceph_oloc_init(struct ceph_object_locator *oloc) { oloc->pool = -1; + oloc->pool_ns = NULL; } static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) @@ -75,11 +77,9 @@ static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) return oloc->pool == -1; } -static inline void ceph_oloc_copy(struct ceph_object_locator *dest, - const struct ceph_object_locator *src) -{ - dest->pool = src->pool; -} +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src); +void ceph_oloc_destroy(struct ceph_object_locator *oloc); /* * Maximum supported by kernel client object name length diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e77b04ca7802..c62b2b029a6e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) seq_printf(s, "]/%d\t[", t->up.primary); for (i = 0; i < t->acting.size; i++) seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); - seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, - t->target_oid.name_len, t->target_oid.name, t->flags); + seq_printf(s, "]/%d\t", t->acting.primary); + if (t->target_oloc.pool_ns) { + seq_printf(s, "%*pE/%*pE\t0x%x", + (int)t->target_oloc.pool_ns->len, + t->target_oloc.pool_ns->str, + t->target_oid.name_len, t->target_oid.name, t->flags); + } else { + seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len, + t->target_oid.name, t->flags); + } if (t->paused) seq_puts(s, "\tP"); } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 23efcac80072..b68cc15e9cdc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); + ceph_oloc_destroy(&t->base_oloc); ceph_oid_destroy(&t->target_oid); + ceph_oloc_destroy(&t->target_oloc); } /* @@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); +static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) +{ + return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); +} + int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) { struct ceph_osd_client *osdc = req->r_osdc; @@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) int msg_size; WARN_ON(ceph_oid_empty(&req->r_base_oid)); + WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); @@ -949,6 +958,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; + req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_snapid = vino.snap; @@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) p += sizeof(req->r_replay_version); /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); + ceph_start_encoding(&p, 5, 4, + ceph_oloc_encoding_size(&req->r_t.target_oloc)); ceph_encode_64(&p, req->r_t.target_oloc.pool); ceph_encode_32(&p, -1); /* preferred */ ceph_encode_32(&p, 0); /* key len */ + if (req->r_t.target_oloc.pool_ns) + ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, + req->r_t.target_oloc.pool_ns->len); + else + ceph_encode_32(&p, 0); /* pgid */ ceph_encode_8(&p, 1); @@ -2596,8 +2610,8 @@ static int ceph_oloc_decode(void **p, void *end, if (struct_v >= 5) { len = ceph_decode_32(p); if (len > 0) { - pr_warn("ceph_object_locator::nspace is set\n"); - goto e_inval; + ceph_decode_need(p, end, len, e_inval); + *p += len; } } @@ -2835,7 +2849,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) unlink_request(osd, req); mutex_unlock(&osd->lock); - ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); + /* + * Not ceph_oloc_copy() - changing pool_ns is not + * supported. + */ + req->r_t.target_oloc.pool = m.redirect.oloc.pool; req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; req->r_tid = 0; __submit_request(req, false); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 5947b2e9fb8e..d2436880b305 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1510,6 +1510,24 @@ bad: return ERR_PTR(err); } +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + WARN_ON(!ceph_oloc_empty(dest)); + WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + + dest->pool = src->pool; + if (src->pool_ns) + dest->pool_ns = ceph_get_string(src->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_copy); + +void ceph_oloc_destroy(struct ceph_object_locator *oloc) +{ + ceph_put_string(oloc->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_destroy); + void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { @@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - raw_pgid->pool = oloc->pool; - raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); - - dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, - raw_pgid->pool, raw_pgid->seed); + if (!oloc->pool_ns) { + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); + } else { + char stack_buf[256]; + char *buf = stack_buf; + int nsl = oloc->pool_ns->len; + size_t total = nsl + 1 + oid->name_len; + + if (total > sizeof(stack_buf)) { + buf = kmalloc(total, GFP_NOIO); + if (!buf) + return -ENOMEM; + } + memcpy(buf, oloc->pool_ns->str, nsl); + buf[nsl] = '\037'; + memcpy(buf + nsl + 1, oid->name, oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); + if (buf != stack_buf) + kfree(buf); + dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, + oid->name, nsl, oloc->pool_ns->str, + raw_pgid->pool, raw_pgid->seed); + } return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); -- cgit v1.2.3 From 774a6a118c70f8c11fcfe636032b5016ad71a746 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 6 Jun 2016 16:01:39 +0800 Subject: ceph: reduce i_nr_by_mode array size Track usage count for individual fmode bit. This can reduce the array size by half. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 45 ++++++++++++++++++++++++++++++-------------- fs/ceph/inode.c | 2 +- fs/ceph/ioctl.c | 3 +-- fs/ceph/super.h | 7 ++----- include/linux/ceph/ceph_fs.h | 2 +- 5 files changed, 36 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0a9406a8a794..a08d245f16f5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -849,12 +849,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci) */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int want = 0; - int mode; - for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) - if (ci->i_nr_by_mode[mode]) - want |= ceph_caps_for_mode(mode); - return want; + int i, bits = 0; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (ci->i_nr_by_mode[i]) + bits |= 1 << i; + } + if (bits == 0) + return 0; + return ceph_caps_for_mode(bits >> 1); } /* @@ -3682,6 +3684,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } +void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) +{ + int i; + int bits = (fmode << 1) | 1; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) + ci->i_nr_by_mode[i]++; + } +} + /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule @@ -3689,15 +3701,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) { - struct inode *inode = &ci->vfs_inode; - int last = 0; - + int i, last = 0; + int bits = (fmode << 1) | 1; spin_lock(&ci->i_ceph_lock); - dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, - ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); - BUG_ON(ci->i_nr_by_mode[fmode] == 0); - if (--ci->i_nr_by_mode[fmode] == 0) - last++; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) { + BUG_ON(ci->i_nr_by_mode[i] == 0); + if (--ci->i_nr_by_mode[i] == 0) + last++; + } + } + dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", + &ci->vfs_inode, fmode, + ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], + ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dc032566ed71..8ca843371d4b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_head_snapc = NULL; ci->i_snap_caps = 0; - for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; mutex_init(&ci->i_truncate_mutex); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 6a30101b55ef..7d752d53353a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -250,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file) if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); - ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; - ci->i_nr_by_mode[fi->fmode]++; + ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7ceab18c8ee2..50846e6f6a8c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -321,7 +321,7 @@ struct ceph_inode_info { dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ - int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ @@ -906,10 +906,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ -static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) -{ - ci->i_nr_by_mode[mode]++; -} +extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 08b8fd72261e..6bc7730d22ac 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -525,7 +525,7 @@ struct ceph_filelock { #define CEPH_FILE_MODE_WR 2 #define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ #define CEPH_FILE_MODE_LAZY 4 /* lazy io */ -#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ +#define CEPH_FILE_MODE_BITS 4 int ceph_flags_to_mode(int flags); -- cgit v1.2.3 From 0cabbd94ff52c4803fc0ad9ad0ad5e43df493ab0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 7 Apr 2016 11:34:43 +0800 Subject: libceph: fsmap.user subscription support Signed-off-by: Yan, Zheng Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 + include/linux/ceph/mon_client.h | 7 ++++--- net/ceph/mon_client.c | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 6bc7730d22ac..7868d602c0a0 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -121,6 +121,7 @@ struct ceph_dir_layout { /* client <-> mds */ #define CEPH_MSG_MDS_MAP 21 +#define CEPH_MSG_FS_MAP_USER 103 #define CEPH_MSG_CLIENT_SESSION 22 #define CEPH_MSG_CLIENT_RECONNECT 23 diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e2a92df08b47..24d704d1ea5c 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -95,7 +95,7 @@ struct ceph_mon_client { struct ceph_mon_subscribe_item item; bool want; u32 have; /* epoch */ - } subs[3]; + } subs[4]; int fs_cluster_id; /* "mdsmap." sub */ #ifdef CONFIG_DEBUG_FS @@ -111,9 +111,10 @@ extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); enum { - CEPH_SUB_MDSMAP = 0, - CEPH_SUB_MONMAP, + CEPH_SUB_MONMAP = 0, CEPH_SUB_OSDMAP, + CEPH_SUB_FSMAP, + CEPH_SUB_MDSMAP, }; extern const char *ceph_sub_str[]; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 37c38a7fb5c5..c83326c5ba58 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc) } const char *ceph_sub_str[] = { - [CEPH_SUB_MDSMAP] = "mdsmap", [CEPH_SUB_MONMAP] = "monmap", [CEPH_SUB_OSDMAP] = "osdmap", + [CEPH_SUB_FSMAP] = "fsmap.user", + [CEPH_SUB_MDSMAP] = "mdsmap", }; /* @@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: + case CEPH_MSG_FS_MAP_USER: m = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!m) return NULL; /* ENOMEM--return skip == 0 */ -- cgit v1.2.3 From a0f2b65275413b3438e9f55b1427273cd893c3b2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Jun 2016 15:04:56 +0200 Subject: ceph: fix symbol versioning for ceph_monc_do_statfs The genksyms helper in the kernel cannot parse a type definition like "typeof(((type *)0)->keyfld)" that is used in the DEFINE_RB_FUNCS helper, causing the following EXPORT_SYMBOL() statement to be ignored when computing the crcs, and triggering a warning about this: WARNING: "ceph_monc_do_statfs" [fs/ceph/ceph.ko] has no CRC To work around the problem, we can rewrite the type to reference an undefined 'extern' symbol instead of a NULL pointer. This is evidently ok for genksyms, and it no longer complains about the line when calling it with 'genksyms -w'. I've looked briefly into extending genksyms instead, but it seems really hard to do. Jan Beulich introduced basic support for 'typeof' a while ago in dc53324060f3 ("genksyms: fix typeof() handling"), but that is not sufficient for the expression we have here. Signed-off-by: Arnd Bergmann Fixes: fcd00b68bbe2 ("libceph: DEFINE_RB_FUNCS macro") Cc: Jan Beulich Cc: Michal Marek Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 6afc5d98fad1..83fc1fff7061 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -215,8 +215,9 @@ static void erase_##name(struct rb_root *root, type *t) \ } #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ +extern type __lookup_##name##_key; \ static type *lookup_##name(struct rb_root *root, \ - typeof(((type *)0)->keyfld) key) \ + typeof(__lookup_##name##_key.keyfld) key) \ { \ struct rb_node *n = root->rb_node; \ \ -- cgit v1.2.3