From b94d5230d06eb930be82e67fb1a9a58271e78297 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 19 May 2015 22:54:31 -0400 Subject: libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support A struct nvdimm_bus is the anchor device for registering nvdimm resources and interfaces, for example, a character control device, nvdimm devices, and I/O region devices. The ACPI NFIT (NVDIMM Firmware Interface Table) is one possible platform description for such non-volatile memory resources in a system. The nfit.ko driver attaches to the "ACPI0012" device that indicates the presence of the NFIT and parses the table to register a struct nvdimm_bus instance. Cc: Cc: Lv Zheng Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Jeff Moyer Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/Kconfig | 14 ++ drivers/acpi/Makefile | 1 + drivers/acpi/nfit.c | 481 ++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/nfit.h | 90 ++++++++++ 4 files changed, 586 insertions(+) create mode 100644 drivers/acpi/nfit.c create mode 100644 drivers/acpi/nfit.h (limited to 'drivers/acpi') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ab2cbb51c6aa..300b4ef3712b 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -383,6 +383,20 @@ config ACPI_REDUCED_HARDWARE_ONLY If you are unsure what to do, do not enable this option. +config ACPI_NFIT + tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select LIBNVDIMM + help + Infrastructure to probe ACPI 6 compliant platforms for + NVDIMMs (NFIT) and register a libnvdimm device tree. In + addition to storage devices this also enables libnvdimm to pass + ACPI._DSM messages for platform/dimm configuration. + + To compile this driver as a module, choose M here: + the module will be called nfit. + source "drivers/acpi/apei/Kconfig" config ACPI_EXTLOG diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 8a063e276530..f7e9c92ccdcb 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-y += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o +obj-$(CONFIG_ACPI_NFIT) += nfit.o obj-y += acpi_memhotplug.o obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o obj-$(CONFIG_ACPI_BATTERY) += battery.o diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c new file mode 100644 index 000000000000..9f9a20bae0ac --- /dev/null +++ b/drivers/acpi/nfit.c @@ -0,0 +1,481 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include "nfit.h" + +static u8 nfit_uuid[NFIT_UUID_MAX][16]; + +static const u8 *to_nfit_uuid(enum nfit_uuids id) +{ + return nfit_uuid[id]; +} + +static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) +{ + return -ENOTTY; +} + +static const char *spa_type_name(u16 type) +{ + static const char *to_name[] = { + [NFIT_SPA_VOLATILE] = "volatile", + [NFIT_SPA_PM] = "pmem", + [NFIT_SPA_DCR] = "dimm-control-region", + [NFIT_SPA_BDW] = "block-data-window", + [NFIT_SPA_VDISK] = "volatile-disk", + [NFIT_SPA_VCD] = "volatile-cd", + [NFIT_SPA_PDISK] = "persistent-disk", + [NFIT_SPA_PCD] = "persistent-cd", + + }; + + if (type > NFIT_SPA_PCD) + return "unknown"; + + return to_name[type]; +} + +static int nfit_spa_type(struct acpi_nfit_system_address *spa) +{ + int i; + + for (i = 0; i < NFIT_UUID_MAX; i++) + if (memcmp(to_nfit_uuid(i), spa->range_guid, 16) == 0) + return i; + return -1; +} + +static bool add_spa(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct device *dev = acpi_desc->dev; + struct nfit_spa *nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa), + GFP_KERNEL); + + if (!nfit_spa) + return false; + INIT_LIST_HEAD(&nfit_spa->list); + nfit_spa->spa = spa; + list_add_tail(&nfit_spa->list, &acpi_desc->spas); + dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__, + spa->range_index, + spa_type_name(nfit_spa_type(spa))); + return true; +} + +static bool add_memdev(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_memory_map *memdev) +{ + struct device *dev = acpi_desc->dev; + struct nfit_memdev *nfit_memdev = devm_kzalloc(dev, + sizeof(*nfit_memdev), GFP_KERNEL); + + if (!nfit_memdev) + return false; + INIT_LIST_HEAD(&nfit_memdev->list); + nfit_memdev->memdev = memdev; + list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs); + dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n", + __func__, memdev->device_handle, memdev->range_index, + memdev->region_index); + return true; +} + +static bool add_dcr(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_control_region *dcr) +{ + struct device *dev = acpi_desc->dev; + struct nfit_dcr *nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr), + GFP_KERNEL); + + if (!nfit_dcr) + return false; + INIT_LIST_HEAD(&nfit_dcr->list); + nfit_dcr->dcr = dcr; + list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs); + dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__, + dcr->region_index, dcr->windows); + return true; +} + +static bool add_bdw(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_data_region *bdw) +{ + struct device *dev = acpi_desc->dev; + struct nfit_bdw *nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw), + GFP_KERNEL); + + if (!nfit_bdw) + return false; + INIT_LIST_HEAD(&nfit_bdw->list); + nfit_bdw->bdw = bdw; + list_add_tail(&nfit_bdw->list, &acpi_desc->bdws); + dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__, + bdw->region_index, bdw->windows); + return true; +} + +static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, + const void *end) +{ + struct device *dev = acpi_desc->dev; + struct acpi_nfit_header *hdr; + void *err = ERR_PTR(-ENOMEM); + + if (table >= end) + return NULL; + + hdr = table; + switch (hdr->type) { + case ACPI_NFIT_TYPE_SYSTEM_ADDRESS: + if (!add_spa(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_MEMORY_MAP: + if (!add_memdev(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_CONTROL_REGION: + if (!add_dcr(acpi_desc, table)) + return err; + break; + case ACPI_NFIT_TYPE_DATA_REGION: + if (!add_bdw(acpi_desc, table)) + return err; + break; + /* TODO */ + case ACPI_NFIT_TYPE_INTERLEAVE: + dev_dbg(dev, "%s: idt\n", __func__); + break; + case ACPI_NFIT_TYPE_FLUSH_ADDRESS: + dev_dbg(dev, "%s: flush\n", __func__); + break; + case ACPI_NFIT_TYPE_SMBIOS: + dev_dbg(dev, "%s: smbios\n", __func__); + break; + default: + dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type); + break; + } + + return table + hdr->length; +} + +static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem) +{ + u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle; + u16 dcr = nfit_mem->dcr->region_index; + struct nfit_spa *nfit_spa; + + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + u16 range_index = nfit_spa->spa->range_index; + int type = nfit_spa_type(nfit_spa->spa); + struct nfit_memdev *nfit_memdev; + + if (type != NFIT_SPA_BDW) + continue; + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + if (nfit_memdev->memdev->range_index != range_index) + continue; + if (nfit_memdev->memdev->device_handle != device_handle) + continue; + if (nfit_memdev->memdev->region_index != dcr) + continue; + + nfit_mem->spa_bdw = nfit_spa->spa; + return; + } + } + + dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n", + nfit_mem->spa_dcr->range_index); + nfit_mem->bdw = NULL; +} + +static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa) +{ + u16 dcr = __to_nfit_memdev(nfit_mem)->region_index; + struct nfit_dcr *nfit_dcr; + struct nfit_bdw *nfit_bdw; + + list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { + if (nfit_dcr->dcr->region_index != dcr) + continue; + nfit_mem->dcr = nfit_dcr->dcr; + break; + } + + if (!nfit_mem->dcr) { + dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n", + spa->range_index, __to_nfit_memdev(nfit_mem) + ? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR"); + return -ENODEV; + } + + /* + * We've found enough to create an nvdimm, optionally + * find an associated BDW + */ + list_add(&nfit_mem->list, &acpi_desc->dimms); + + list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) { + if (nfit_bdw->bdw->region_index != dcr) + continue; + nfit_mem->bdw = nfit_bdw->bdw; + break; + } + + if (!nfit_mem->bdw) + return 0; + + nfit_mem_find_spa_bdw(acpi_desc, nfit_mem); + return 0; +} + +static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_mem *nfit_mem, *found; + struct nfit_memdev *nfit_memdev; + int type = nfit_spa_type(spa); + u16 dcr; + + switch (type) { + case NFIT_SPA_DCR: + case NFIT_SPA_PM: + break; + default: + return 0; + } + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + int rc; + + if (nfit_memdev->memdev->range_index != spa->range_index) + continue; + found = NULL; + dcr = nfit_memdev->memdev->region_index; + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) + if (__to_nfit_memdev(nfit_mem)->region_index == dcr) { + found = nfit_mem; + break; + } + + if (found) + nfit_mem = found; + else { + nfit_mem = devm_kzalloc(acpi_desc->dev, + sizeof(*nfit_mem), GFP_KERNEL); + if (!nfit_mem) + return -ENOMEM; + INIT_LIST_HEAD(&nfit_mem->list); + } + + if (type == NFIT_SPA_DCR) { + /* multiple dimms may share a SPA when interleaved */ + nfit_mem->spa_dcr = spa; + nfit_mem->memdev_dcr = nfit_memdev->memdev; + } else { + /* + * A single dimm may belong to multiple SPA-PM + * ranges, record at least one in addition to + * any SPA-DCR range. + */ + nfit_mem->memdev_pmem = nfit_memdev->memdev; + } + + if (found) + continue; + + rc = nfit_mem_add(acpi_desc, nfit_mem, spa); + if (rc) + return rc; + } + + return 0; +} + +static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b) +{ + struct nfit_mem *a = container_of(_a, typeof(*a), list); + struct nfit_mem *b = container_of(_b, typeof(*b), list); + u32 handleA, handleB; + + handleA = __to_nfit_memdev(a)->device_handle; + handleB = __to_nfit_memdev(b)->device_handle; + if (handleA < handleB) + return -1; + else if (handleA > handleB) + return 1; + return 0; +} + +static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_spa *nfit_spa; + + /* + * For each SPA-DCR or SPA-PMEM address range find its + * corresponding MEMDEV(s). From each MEMDEV find the + * corresponding DCR. Then, if we're operating on a SPA-DCR, + * try to find a SPA-BDW and a corresponding BDW that references + * the DCR. Throw it all into an nfit_mem object. Note, that + * BDWs are optional. + */ + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + int rc; + + rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa); + if (rc) + return rc; + } + + list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp); + + return 0; +} + +static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) +{ + struct device *dev = acpi_desc->dev; + const void *end; + u8 *data; + + INIT_LIST_HEAD(&acpi_desc->spas); + INIT_LIST_HEAD(&acpi_desc->dcrs); + INIT_LIST_HEAD(&acpi_desc->bdws); + INIT_LIST_HEAD(&acpi_desc->memdevs); + INIT_LIST_HEAD(&acpi_desc->dimms); + + data = (u8 *) acpi_desc->nfit; + end = data + sz; + data += sizeof(struct acpi_table_nfit); + while (!IS_ERR_OR_NULL(data)) + data = add_table(acpi_desc, data, end); + + if (IS_ERR(data)) { + dev_dbg(dev, "%s: nfit table parsing error: %ld\n", __func__, + PTR_ERR(data)); + return PTR_ERR(data); + } + + if (nfit_mem_init(acpi_desc) != 0) + return -ENOMEM; + + return 0; +} + +static int acpi_nfit_add(struct acpi_device *adev) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct device *dev = &adev->dev; + struct acpi_table_header *tbl; + acpi_status status = AE_OK; + acpi_size sz; + int rc; + + status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz); + if (ACPI_FAILURE(status)) { + dev_err(dev, "failed to find NFIT\n"); + return -ENXIO; + } + + acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL); + if (!acpi_desc) + return -ENOMEM; + + dev_set_drvdata(dev, acpi_desc); + acpi_desc->dev = dev; + acpi_desc->nfit = (struct acpi_table_nfit *) tbl; + nd_desc = &acpi_desc->nd_desc; + nd_desc->provider_name = "ACPI.NFIT"; + nd_desc->ndctl = acpi_nfit_ctl; + + acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc); + if (!acpi_desc->nvdimm_bus) + return -ENXIO; + + rc = acpi_nfit_init(acpi_desc, sz); + if (rc) { + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return rc; + } + return 0; +} + +static int acpi_nfit_remove(struct acpi_device *adev) +{ + struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev); + + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return 0; +} + +static const struct acpi_device_id acpi_nfit_ids[] = { + { "ACPI0012", 0 }, + { "", 0 }, +}; +MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids); + +static struct acpi_driver acpi_nfit_driver = { + .name = KBUILD_MODNAME, + .ids = acpi_nfit_ids, + .ops = { + .add = acpi_nfit_add, + .remove = acpi_nfit_remove, + }, +}; + +static __init int nfit_init(void) +{ + BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40); + BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 56); + BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48); + BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20); + BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9); + BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80); + BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40); + + acpi_str_to_uuid(UUID_VOLATILE_MEMORY, nfit_uuid[NFIT_SPA_VOLATILE]); + acpi_str_to_uuid(UUID_PERSISTENT_MEMORY, nfit_uuid[NFIT_SPA_PM]); + acpi_str_to_uuid(UUID_CONTROL_REGION, nfit_uuid[NFIT_SPA_DCR]); + acpi_str_to_uuid(UUID_DATA_REGION, nfit_uuid[NFIT_SPA_BDW]); + acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_VDISK]); + acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_CD, nfit_uuid[NFIT_SPA_VCD]); + acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_PDISK]); + acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]); + acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]); + acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]); + + return acpi_bus_register_driver(&acpi_nfit_driver); +} + +static __exit void nfit_exit(void) +{ + acpi_bus_unregister_driver(&acpi_nfit_driver); +} + +module_init(nfit_init); +module_exit(nfit_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h new file mode 100644 index 000000000000..c6baba64703f --- /dev/null +++ b/drivers/acpi/nfit.h @@ -0,0 +1,90 @@ +/* + * NVDIMM Firmware Interface Table - NFIT + * + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __NFIT_H__ +#define __NFIT_H__ +#include +#include +#include +#include +#include + +#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba" +#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" + +enum nfit_uuids { + NFIT_SPA_VOLATILE, + NFIT_SPA_PM, + NFIT_SPA_DCR, + NFIT_SPA_BDW, + NFIT_SPA_VDISK, + NFIT_SPA_VCD, + NFIT_SPA_PDISK, + NFIT_SPA_PCD, + NFIT_DEV_BUS, + NFIT_DEV_DIMM, + NFIT_UUID_MAX, +}; + +struct nfit_spa { + struct acpi_nfit_system_address *spa; + struct list_head list; +}; + +struct nfit_dcr { + struct acpi_nfit_control_region *dcr; + struct list_head list; +}; + +struct nfit_bdw { + struct acpi_nfit_data_region *bdw; + struct list_head list; +}; + +struct nfit_memdev { + struct acpi_nfit_memory_map *memdev; + struct list_head list; +}; + +/* assembled tables for a given dimm/memory-device */ +struct nfit_mem { + struct acpi_nfit_memory_map *memdev_dcr; + struct acpi_nfit_memory_map *memdev_pmem; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_data_region *bdw; + struct acpi_nfit_system_address *spa_dcr; + struct acpi_nfit_system_address *spa_bdw; + struct list_head list; +}; + +struct acpi_nfit_desc { + struct nvdimm_bus_descriptor nd_desc; + struct acpi_table_nfit *nfit; + struct list_head memdevs; + struct list_head dimms; + struct list_head spas; + struct list_head dcrs; + struct list_head bdws; + struct nvdimm_bus *nvdimm_bus; + struct device *dev; +}; + +static inline struct acpi_nfit_memory_map *__to_nfit_memdev( + struct nfit_mem *nfit_mem) +{ + if (nfit_mem->memdev_dcr) + return nfit_mem->memdev_dcr; + return nfit_mem->memdev_pmem; +} +#endif /* __NFIT_H__ */ -- cgit v1.2.3 From 45def22c1fab85764646746ce38d45b2f3281fa5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 26 Apr 2015 19:26:48 -0400 Subject: libnvdimm: control character device and nvdimm_bus sysfs attributes The control device for a nvdimm_bus is registered as an "nd" class device. The expectation is that there will usually only be one "nd" bus registered under /sys/class/nd. However, we allow for the possibility of multiple buses and they will listed in discovery order as ndctl0...ndctlN. This character device hosts the ioctl for passing control messages. The initial command set has a 1:1 correlation with the commands listed in the by the "NFIT DSM Example" document [1], but this scheme is extensible to future command sets. Note, nd_ioctl() and the backing ->ndctl() implementation are defined in a subsequent patch. This is simply the initial registrations and sysfs attributes. [1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Neil Brown Cc: Greg KH Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 28 +++++++++++++++ drivers/acpi/nfit.h | 6 ++++ drivers/nvdimm/Makefile | 1 + drivers/nvdimm/bus.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/core.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-- drivers/nvdimm/nd-core.h | 6 ++++ include/linux/libnvdimm.h | 6 +++- 7 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 drivers/nvdimm/bus.c (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 9f9a20bae0ac..162391b32022 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -354,6 +354,33 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc) return 0; } +static ssize_t revision_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + + return sprintf(buf, "%d\n", acpi_desc->nfit->header.revision); +} +static DEVICE_ATTR_RO(revision); + +static struct attribute *acpi_nfit_attributes[] = { + &dev_attr_revision.attr, + NULL, +}; + +static struct attribute_group acpi_nfit_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_attributes, +}; + +static const struct attribute_group *acpi_nfit_attribute_groups[] = { + &nvdimm_bus_attribute_group, + &acpi_nfit_attribute_group, + NULL, +}; + static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) { struct device *dev = acpi_desc->dev; @@ -410,6 +437,7 @@ static int acpi_nfit_add(struct acpi_device *adev) nd_desc = &acpi_desc->nd_desc; nd_desc->provider_name = "ACPI.NFIT"; nd_desc->ndctl = acpi_nfit_ctl; + nd_desc->attr_groups = acpi_nfit_attribute_groups; acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc); if (!acpi_desc->nvdimm_bus) diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index c6baba64703f..eda565fa81ef 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -87,4 +87,10 @@ static inline struct acpi_nfit_memory_map *__to_nfit_memdev( return nfit_mem->memdev_dcr; return nfit_mem->memdev_pmem; } + +static inline struct acpi_nfit_desc *to_acpi_desc( + struct nvdimm_bus_descriptor *nd_desc) +{ + return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); +} #endif /* __NFIT_H__ */ diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 10bc7af47992..8a8293c72c7c 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o libnvdimm-y := core.o +libnvdimm-y += bus.o diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c new file mode 100644 index 000000000000..3f7c690a5d0c --- /dev/null +++ b/drivers/nvdimm/bus.c @@ -0,0 +1,83 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "nd-core.h" + +static int nvdimm_bus_major; +static struct class *nd_class; + +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); + struct device *dev; + + dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus, + "ndctl%d", nvdimm_bus->id); + + if (IS_ERR(dev)) { + dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n", + nvdimm_bus->id, PTR_ERR(dev)); + return PTR_ERR(dev); + } + return 0; +} + +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); +} + +static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return -ENXIO; +} + +static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = nd_ioctl, + .compat_ioctl = nd_ioctl, + .llseek = noop_llseek, +}; + +int __init nvdimm_bus_init(void) +{ + int rc; + + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); + if (rc < 0) + return rc; + nvdimm_bus_major = rc; + + nd_class = class_create(THIS_MODULE, "nd"); + if (IS_ERR(nd_class)) + goto err_class; + + return 0; + + err_class: + unregister_chrdev(nvdimm_bus_major, "ndctl"); + + return rc; +} + +void __exit nvdimm_bus_exit(void) +{ + class_destroy(nd_class); + unregister_chrdev(nvdimm_bus_major, "ndctl"); +} diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index c578a49867ac..fa7ab5ad0318 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -14,9 +14,12 @@ #include #include #include +#include #include #include "nd-core.h" +static LIST_HEAD(nvdimm_bus_list); +static DEFINE_MUTEX(nvdimm_bus_list_mutex); static DEFINE_IDA(nd_ida); static void nvdimm_bus_release(struct device *dev) @@ -28,6 +31,55 @@ static void nvdimm_bus_release(struct device *dev) kfree(nvdimm_bus); } +struct nvdimm_bus *to_nvdimm_bus(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release); + return nvdimm_bus; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus); + +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return nvdimm_bus->nd_desc; +} +EXPORT_SYMBOL_GPL(to_nd_desc); + +static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct device *parent = nvdimm_bus->dev.parent; + + if (nd_desc->provider_name) + return nd_desc->provider_name; + else if (parent) + return dev_name(parent); + else + return "unknown"; +} + +static ssize_t provider_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus)); +} +static DEVICE_ATTR_RO(provider); + +static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_provider.attr, + NULL, +}; + +struct attribute_group nvdimm_bus_attribute_group = { + .attrs = nvdimm_bus_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); + struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nd_desc) { @@ -37,6 +89,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL); if (!nvdimm_bus) return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); @@ -45,15 +98,26 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, nvdimm_bus->nd_desc = nd_desc; nvdimm_bus->dev.parent = parent; nvdimm_bus->dev.release = nvdimm_bus_release; + nvdimm_bus->dev.groups = nd_desc->attr_groups; dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id); rc = device_register(&nvdimm_bus->dev); if (rc) { dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc); - put_device(&nvdimm_bus->dev); - return NULL; + goto err; } + rc = nvdimm_bus_create_ndctl(nvdimm_bus); + if (rc) + goto err; + + mutex_lock(&nvdimm_bus_list_mutex); + list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list); + mutex_unlock(&nvdimm_bus_list_mutex); + return nvdimm_bus; + err: + put_device(&nvdimm_bus->dev); + return NULL; } EXPORT_SYMBOL_GPL(nvdimm_bus_register); @@ -61,9 +125,29 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) { if (!nvdimm_bus) return; + + mutex_lock(&nvdimm_bus_list_mutex); + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + + nvdimm_bus_destroy_ndctl(nvdimm_bus); + device_unregister(&nvdimm_bus->dev); } EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); +static __init int libnvdimm_init(void) +{ + return nvdimm_bus_init(); +} + +static __exit void libnvdimm_exit(void) +{ + WARN_ON(!list_empty(&nvdimm_bus_list)); + nvdimm_bus_exit(); +} + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); +subsys_initcall(libnvdimm_init); +module_exit(libnvdimm_exit); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 291b2fdcd96b..0b0ff2423161 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -17,7 +17,13 @@ struct nvdimm_bus { struct nvdimm_bus_descriptor *nd_desc; + struct list_head list; struct device dev; int id; }; + +int __init nvdimm_bus_init(void); +void __exit nvdimm_bus_exit(void); +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); #endif /* __ND_CORE_H__ */ diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 2b3c63950c91..d375cdc4abd5 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,8 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +extern struct attribute_group nvdimm_bus_attribute_group; + struct nvdimm; struct nvdimm_bus_descriptor; typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, @@ -21,14 +23,16 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, unsigned int buf_len); struct nvdimm_bus_descriptor { + const struct attribute_group **attr_groups; unsigned long dsm_mask; char *provider_name; ndctl_fn ndctl; }; struct device; -struct nvdimm_bus; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); +struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From e6dfb2de47768efe8cc37c9a1863d2aff81440fb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 25 Apr 2015 03:56:17 -0400 Subject: libnvdimm, nfit: dimm/memory-devices Enable nvdimm devices to be registered on a nvdimm_bus. The kernel assigned device id for nvdimm devicesis dynamic. If userspace needs a more static identifier it should consult a provider-specific attribute. In the case where NFIT is the provider, the 'nmemX/nfit/handle' or 'nmemX/nfit/serial' attributes may be used for this purpose. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 161 ++++++++++++++++++++++++++++++++++++++++++++- drivers/acpi/nfit.h | 1 + drivers/nvdimm/Makefile | 1 + drivers/nvdimm/bus.c | 14 +++- drivers/nvdimm/core.c | 33 +++++++++- drivers/nvdimm/dimm_devs.c | 92 ++++++++++++++++++++++++++ drivers/nvdimm/nd-core.h | 12 ++++ include/linux/libnvdimm.h | 11 ++++ 8 files changed, 321 insertions(+), 4 deletions(-) create mode 100644 drivers/nvdimm/dimm_devs.c (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 162391b32022..9fd7781f966e 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -381,6 +381,165 @@ static const struct attribute_group *acpi_nfit_attribute_groups[] = { NULL, }; +static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + + return __to_nfit_memdev(nfit_mem); +} + +static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + + return nfit_mem->dcr; +} + +static ssize_t handle_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); + + return sprintf(buf, "%#x\n", memdev->device_handle); +} +static DEVICE_ATTR_RO(handle); + +static ssize_t phys_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev); + + return sprintf(buf, "%#x\n", memdev->physical_id); +} +static DEVICE_ATTR_RO(phys_id); + +static ssize_t vendor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->vendor_id); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t rev_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->revision_id); +} +static DEVICE_ATTR_RO(rev_id); + +static ssize_t device_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->device_id); +} +static DEVICE_ATTR_RO(device); + +static ssize_t format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->code); +} +static DEVICE_ATTR_RO(format); + +static ssize_t serial_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev); + + return sprintf(buf, "%#x\n", dcr->serial_number); +} +static DEVICE_ATTR_RO(serial); + +static struct attribute *acpi_nfit_dimm_attributes[] = { + &dev_attr_handle.attr, + &dev_attr_phys_id.attr, + &dev_attr_vendor.attr, + &dev_attr_device.attr, + &dev_attr_format.attr, + &dev_attr_serial.attr, + &dev_attr_rev_id.attr, + NULL, +}; + +static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (to_nfit_dcr(dev)) + return a->mode; + else + return 0; +} + +static struct attribute_group acpi_nfit_dimm_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_dimm_attributes, + .is_visible = acpi_nfit_dimm_attr_visible, +}; + +static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { + &acpi_nfit_dimm_attribute_group, + NULL, +}; + +static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, + u32 device_handle) +{ + struct nfit_mem *nfit_mem; + + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) + if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle) + return nfit_mem->nvdimm; + + return NULL; +} + +static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_mem *nfit_mem; + + list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { + struct nvdimm *nvdimm; + unsigned long flags = 0; + u32 device_handle; + + device_handle = __to_nfit_memdev(nfit_mem)->device_handle; + nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); + if (nvdimm) { + /* + * If for some reason we find multiple DCRs the + * first one wins + */ + dev_err(acpi_desc->dev, "duplicate DCR detected: %s\n", + nvdimm_name(nvdimm)); + continue; + } + + if (nfit_mem->bdw && nfit_mem->memdev_pmem) + flags |= NDD_ALIASING; + + nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, + acpi_nfit_dimm_attribute_groups, flags); + if (!nvdimm) + return -ENOMEM; + + nfit_mem->nvdimm = nvdimm; + } + + return 0; +} + static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) { struct device *dev = acpi_desc->dev; @@ -408,7 +567,7 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) if (nfit_mem_init(acpi_desc) != 0) return -ENOMEM; - return 0; + return acpi_nfit_register_dimms(acpi_desc); } static int acpi_nfit_add(struct acpi_device *adev) diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index eda565fa81ef..9dd437fe5563 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -59,6 +59,7 @@ struct nfit_memdev { /* assembled tables for a given dimm/memory-device */ struct nfit_mem { + struct nvdimm *nvdimm; struct acpi_nfit_memory_map *memdev_dcr; struct acpi_nfit_memory_map *memdev_pmem; struct acpi_nfit_control_region *dcr; diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 8a8293c72c7c..5b68738ba406 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o libnvdimm-y := core.o libnvdimm-y += bus.o +libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 3f7c690a5d0c..a8802577fb55 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include #include @@ -21,6 +22,10 @@ static int nvdimm_bus_major; static struct class *nd_class; +struct bus_type nvdimm_bus_type = { + .name = "nd", +}; + int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) { dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); @@ -59,9 +64,13 @@ int __init nvdimm_bus_init(void) { int rc; + rc = bus_register(&nvdimm_bus_type); + if (rc) + return rc; + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); if (rc < 0) - return rc; + goto err_chrdev; nvdimm_bus_major = rc; nd_class = class_create(THIS_MODULE, "nd"); @@ -72,6 +81,8 @@ int __init nvdimm_bus_init(void) err_class: unregister_chrdev(nvdimm_bus_major, "ndctl"); + err_chrdev: + bus_unregister(&nvdimm_bus_type); return rc; } @@ -80,4 +91,5 @@ void __exit nvdimm_bus_exit(void) { class_destroy(nd_class); unregister_chrdev(nvdimm_bus_major, "ndctl"); + bus_unregister(&nvdimm_bus_type); } diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index fa7ab5ad0318..ef957eb37c90 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -18,8 +18,8 @@ #include #include "nd-core.h" -static LIST_HEAD(nvdimm_bus_list); -static DEFINE_MUTEX(nvdimm_bus_list_mutex); +LIST_HEAD(nvdimm_bus_list); +DEFINE_MUTEX(nvdimm_bus_list_mutex); static DEFINE_IDA(nd_ida); static void nvdimm_bus_release(struct device *dev) @@ -48,6 +48,19 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) } EXPORT_SYMBOL_GPL(to_nd_desc); +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) +{ + struct device *dev; + + for (dev = nd_dev; dev; dev = dev->parent) + if (dev->release == nvdimm_bus_release) + break; + dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n"); + if (dev) + return to_nvdimm_bus(dev); + return NULL; +} + static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) { struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; @@ -121,6 +134,21 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, } EXPORT_SYMBOL_GPL(nvdimm_bus_register); +static int child_unregister(struct device *dev, void *data) +{ + /* + * the singular ndctl class device per bus needs to be + * "device_destroy"ed, so skip it here + * + * i.e. remove classless children + */ + if (dev->class) + /* pass */; + else + device_unregister(dev); + return 0; +} + void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) { if (!nvdimm_bus) @@ -130,6 +158,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) list_del_init(&nvdimm_bus->list); mutex_unlock(&nvdimm_bus_list_mutex); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); nvdimm_bus_destroy_ndctl(nvdimm_bus); device_unregister(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c new file mode 100644 index 000000000000..51ea52cc2079 --- /dev/null +++ b/drivers/nvdimm/dimm_devs.c @@ -0,0 +1,92 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "nd-core.h" + +static DEFINE_IDA(dimm_ida); + +static void nvdimm_release(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + ida_simple_remove(&dimm_ida, nvdimm->id); + kfree(nvdimm); +} + +static struct device_type nvdimm_device_type = { + .name = "nvdimm", + .release = nvdimm_release, +}; + +static bool is_nvdimm(struct device *dev) +{ + return dev->type == &nvdimm_device_type; +} + +struct nvdimm *to_nvdimm(struct device *dev) +{ + struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev); + + WARN_ON(!is_nvdimm(dev)); + return nvdimm; +} +EXPORT_SYMBOL_GPL(to_nvdimm); + +const char *nvdimm_name(struct nvdimm *nvdimm) +{ + return dev_name(&nvdimm->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_name); + +void *nvdimm_provider_data(struct nvdimm *nvdimm) +{ + return nvdimm->provider_data; +} +EXPORT_SYMBOL_GPL(nvdimm_provider_data); + +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags) +{ + struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); + struct device *dev; + + if (!nvdimm) + return NULL; + + nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL); + if (nvdimm->id < 0) { + kfree(nvdimm); + return NULL; + } + nvdimm->provider_data = provider_data; + nvdimm->flags = flags; + + dev = &nvdimm->dev; + dev_set_name(dev, "nmem%d", nvdimm->id); + dev->parent = &nvdimm_bus->dev; + dev->type = &nvdimm_device_type; + dev->bus = &nvdimm_bus_type; + dev->groups = groups; + if (device_register(dev) != 0) { + put_device(dev); + return NULL; + } + + return nvdimm; +} +EXPORT_SYMBOL_GPL(nvdimm_create); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 0b0ff2423161..9b8303413b60 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -15,6 +15,10 @@ #include #include +extern struct list_head nvdimm_bus_list; +extern struct mutex nvdimm_bus_list_mutex; +extern struct bus_type nvdimm_bus_type; + struct nvdimm_bus { struct nvdimm_bus_descriptor *nd_desc; struct list_head list; @@ -22,6 +26,14 @@ struct nvdimm_bus { int id; }; +struct nvdimm { + unsigned long flags; + void *provider_data; + struct device dev; + int id; +}; + +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void __exit nvdimm_bus_exit(void); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d375cdc4abd5..07787f0dd7de 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,12 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ + +enum { + /* when a dimm supports both PMEM and BLK access a label is required */ + NDD_ALIASING = 1 << 0, +}; + extern struct attribute_group nvdimm_bus_attribute_group; struct nvdimm; @@ -34,5 +40,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); +struct nvdimm *to_nvdimm(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); +const char *nvdimm_name(struct nvdimm *nvdimm); +void *nvdimm_provider_data(struct nvdimm *nvdimm); +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 62232e45f4a265abb43f0acf16e58f5d0b6e1ec9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 Jun 2015 14:27:06 -0400 Subject: libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices Most discovery/configuration of the nvdimm-subsystem is done via sysfs attributes. However, some nvdimm_bus instances, particularly the ACPI.NFIT bus, define a small set of messages that can be passed to the platform. For convenience we derive the initial libnvdimm-ioctl command formats directly from the NFIT DSM Interface Example formats. ND_CMD_SMART: media health and diagnostics ND_CMD_GET_CONFIG_SIZE: size of the label space ND_CMD_GET_CONFIG_DATA: read label space ND_CMD_SET_CONFIG_DATA: write label space ND_CMD_VENDOR: vendor-specific command passthrough ND_CMD_ARS_CAP: report address-range-scrubbing capabilities ND_CMD_ARS_START: initiate scrubbing ND_CMD_ARS_STATUS: report on scrubbing state ND_CMD_SMART_THRESHOLD: configure alarm thresholds for smart events If a platform later defines different commands than this set it is straightforward to extend support to those formats. Most of the commands target a specific dimm. However, the address-range-scrubbing commands target the bus. The 'commands' attribute in sysfs of an nvdimm_bus, or nvdimm, enumerate the supported commands for that object. Cc: Cc: Robert Moore Cc: Rafael J. Wysocki Reported-by: Nicholas Moulin Acked-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/acpi/Kconfig | 12 ++ drivers/acpi/nfit.c | 216 +++++++++++++++++++++++++++++- drivers/acpi/nfit.h | 3 + drivers/nvdimm/bus.c | 326 ++++++++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/core.c | 16 +++ drivers/nvdimm/dimm_devs.c | 38 +++++- drivers/nvdimm/nd-core.h | 3 + include/linux/libnvdimm.h | 27 +++- include/uapi/linux/Kbuild | 1 + include/uapi/linux/ndctl.h | 178 +++++++++++++++++++++++++ 10 files changed, 810 insertions(+), 10 deletions(-) create mode 100644 include/uapi/linux/ndctl.h (limited to 'drivers/acpi') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 300b4ef3712b..9c43ae301300 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -397,6 +397,18 @@ config ACPI_NFIT To compile this driver as a module, choose M here: the module will be called nfit. +config ACPI_NFIT_DEBUG + bool "NFIT DSM debug" + depends on ACPI_NFIT + depends on DYNAMIC_DEBUG + default n + help + Enabling this option causes the nfit driver to dump the + input and output buffers of _DSM operations on the ACPI0012 + device and its children. This can be very verbose, so leave + it disabled unless you are debugging a hardware / firmware + issue. + source "drivers/acpi/apei/Kconfig" config ACPI_EXTLOG diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 9fd7781f966e..9112a6210a4b 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include "nfit.h" @@ -24,11 +25,153 @@ static const u8 *to_nfit_uuid(enum nfit_uuids id) return nfit_uuid[id]; } +static struct acpi_nfit_desc *to_acpi_nfit_desc( + struct nvdimm_bus_descriptor *nd_desc) +{ + return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); +} + +static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + + /* + * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct + * acpi_device. + */ + if (!nd_desc->provider_name + || strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0) + return NULL; + + return to_acpi_device(acpi_desc->dev); +} + static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len) { - return -ENOTTY; + struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc); + const struct nd_cmd_desc *desc = NULL; + union acpi_object in_obj, in_buf, *out_obj; + struct device *dev = acpi_desc->dev; + const char *cmd_name, *dimm_name; + unsigned long dsm_mask; + acpi_handle handle; + const u8 *uuid; + u32 offset; + int rc, i; + + if (nvdimm) { + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + struct acpi_device *adev = nfit_mem->adev; + + if (!adev) + return -ENOTTY; + dimm_name = dev_name(&adev->dev); + cmd_name = nvdimm_cmd_name(cmd); + dsm_mask = nfit_mem->dsm_mask; + desc = nd_cmd_dimm_desc(cmd); + uuid = to_nfit_uuid(NFIT_DEV_DIMM); + handle = adev->handle; + } else { + struct acpi_device *adev = to_acpi_dev(acpi_desc); + + cmd_name = nvdimm_bus_cmd_name(cmd); + dsm_mask = nd_desc->dsm_mask; + desc = nd_cmd_bus_desc(cmd); + uuid = to_nfit_uuid(NFIT_DEV_BUS); + handle = adev->handle; + dimm_name = "bus"; + } + + if (!desc || (cmd && (desc->out_num + desc->in_num == 0))) + return -ENOTTY; + + if (!test_bit(cmd, &dsm_mask)) + return -ENOTTY; + + in_obj.type = ACPI_TYPE_PACKAGE; + in_obj.package.count = 1; + in_obj.package.elements = &in_buf; + in_buf.type = ACPI_TYPE_BUFFER; + in_buf.buffer.pointer = buf; + in_buf.buffer.length = 0; + + /* libnvdimm has already validated the input envelope */ + for (i = 0; i < desc->in_num; i++) + in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc, + i, buf); + + if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { + dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__, + dimm_name, cmd_name, in_buf.buffer.length); + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, + 4, in_buf.buffer.pointer, min_t(u32, 128, + in_buf.buffer.length), true); + } + + out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj); + if (!out_obj) { + dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name, + cmd_name); + return -EINVAL; + } + + if (out_obj->package.type != ACPI_TYPE_BUFFER) { + dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n", + __func__, dimm_name, cmd_name, out_obj->type); + rc = -EINVAL; + goto out; + } + + if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) { + dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__, + dimm_name, cmd_name, out_obj->buffer.length); + print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4, + 4, out_obj->buffer.pointer, min_t(u32, 128, + out_obj->buffer.length), true); + } + + for (i = 0, offset = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf, + (u32 *) out_obj->buffer.pointer); + + if (offset + out_size > out_obj->buffer.length) { + dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + break; + } + + if (in_buf.buffer.length + offset + out_size > buf_len) { + dev_dbg(dev, "%s:%s output overrun cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + rc = -ENXIO; + goto out; + } + memcpy(buf + in_buf.buffer.length + offset, + out_obj->buffer.pointer + offset, out_size); + offset += out_size; + } + if (offset + in_buf.buffer.length < buf_len) { + if (i >= 1) { + /* + * status valid, return the number of bytes left + * unfilled in the output buffer + */ + rc = buf_len - offset - in_buf.buffer.length; + } else { + dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n", + __func__, dimm_name, cmd_name, buf_len, + offset); + rc = -ENXIO; + } + } else + rc = 0; + + out: + ACPI_FREE(out_obj); + + return rc; } static const char *spa_type_name(u16 type) @@ -489,6 +632,7 @@ static struct attribute_group acpi_nfit_dimm_attribute_group = { }; static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { + &nvdimm_attribute_group, &acpi_nfit_dimm_attribute_group, NULL, }; @@ -505,6 +649,50 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc, return NULL; } +static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, + struct nfit_mem *nfit_mem, u32 device_handle) +{ + struct acpi_device *adev, *adev_dimm; + struct device *dev = acpi_desc->dev; + const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM); + unsigned long long sta; + int i, rc = -ENODEV; + acpi_status status; + + nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en; + adev = to_acpi_dev(acpi_desc); + if (!adev) + return 0; + + adev_dimm = acpi_find_child_device(adev, device_handle, false); + nfit_mem->adev = adev_dimm; + if (!adev_dimm) { + dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n", + device_handle); + return -ENODEV; + } + + status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta); + if (status == AE_NOT_FOUND) { + dev_dbg(dev, "%s missing _STA, assuming enabled...\n", + dev_name(&adev_dimm->dev)); + rc = 0; + } else if (ACPI_FAILURE(status)) + dev_err(dev, "%s failed to retrieve_STA, disabling...\n", + dev_name(&adev_dimm->dev)); + else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0) + dev_info(dev, "%s disabled by firmware\n", + dev_name(&adev_dimm->dev)); + else + rc = 0; + + for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++) + if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i)) + set_bit(i, &nfit_mem->dsm_mask); + + return rc; +} + static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; @@ -513,6 +701,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) struct nvdimm *nvdimm; unsigned long flags = 0; u32 device_handle; + int rc; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle); @@ -529,8 +718,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) if (nfit_mem->bdw && nfit_mem->memdev_pmem) flags |= NDD_ALIASING; + rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); + if (rc) + continue; + nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem, - acpi_nfit_dimm_attribute_groups, flags); + acpi_nfit_dimm_attribute_groups, + flags, &nfit_mem->dsm_mask); if (!nvdimm) return -ENOMEM; @@ -540,6 +734,22 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) return 0; } +static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) +{ + struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc; + const u8 *uuid = to_nfit_uuid(NFIT_DEV_BUS); + struct acpi_device *adev; + int i; + + adev = to_acpi_dev(acpi_desc); + if (!adev) + return; + + for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++) + if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i)) + set_bit(i, &nd_desc->dsm_mask); +} + static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) { struct device *dev = acpi_desc->dev; @@ -567,6 +777,8 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) if (nfit_mem_init(acpi_desc) != 0) return -ENOMEM; + acpi_nfit_init_dsms(acpi_desc); + return acpi_nfit_register_dimms(acpi_desc); } diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index 9dd437fe5563..b76e33629098 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -67,6 +67,8 @@ struct nfit_mem { struct acpi_nfit_system_address *spa_dcr; struct acpi_nfit_system_address *spa_bdw; struct list_head list; + struct acpi_device *adev; + unsigned long dsm_mask; }; struct acpi_nfit_desc { @@ -79,6 +81,7 @@ struct acpi_nfit_desc { struct list_head bdws; struct nvdimm_bus *nvdimm_bus; struct device *dev; + unsigned long dimm_dsm_force_en; }; static inline struct acpi_nfit_memory_map *__to_nfit_memdev( diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index a8802577fb55..15f3a3ddc225 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,14 +11,18 @@ * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include +#include #include #include #include +#include #include "nd-core.h" +int nvdimm_major; static int nvdimm_bus_major; static struct class *nd_class; @@ -47,19 +51,325 @@ void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); } +static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_SMART] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_SMART_THRESHOLD] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_DIMM_FLAGS] = { + .out_num = 2, + .out_sizes = { 4, 4 }, + }, + [ND_CMD_GET_CONFIG_SIZE] = { + .out_num = 3, + .out_sizes = { 4, 4, 4, }, + }, + [ND_CMD_GET_CONFIG_DATA] = { + .in_num = 2, + .in_sizes = { 4, 4, }, + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, + [ND_CMD_SET_CONFIG_DATA] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_VENDOR] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs)) + return &__nd_cmd_dimm_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc); + +static const struct nd_cmd_desc __nd_cmd_bus_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_ARS_CAP] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 2, + .out_sizes = { 4, 4, }, + }, + [ND_CMD_ARS_START] = { + .in_num = 4, + .in_sizes = { 8, 8, 2, 6, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_ARS_STATUS] = { + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs)) + return &__nd_cmd_bus_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_bus_desc); + +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf) +{ + if (idx >= desc->in_num) + return UINT_MAX; + + if (desc->in_sizes[idx] < UINT_MAX) + return desc->in_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) { + struct nd_cmd_set_config_hdr *hdr = buf; + + return hdr->in_length; + } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) { + struct nd_cmd_vendor_hdr *hdr = buf; + + return hdr->in_length; + } + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_in_size); + +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field) +{ + if (idx >= desc->out_num) + return UINT_MAX; + + if (desc->out_sizes[idx] < UINT_MAX) + return desc->out_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1) + return in_field[1]; + else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) + return out_field[1]; + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1) + return ND_CMD_ARS_STATUS_MAX; + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_out_size); + +static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, + int read_only, unsigned int ioctl_cmd, unsigned long arg) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + size_t buf_len = 0, in_len = 0, out_len = 0; + static char out_env[ND_CMD_MAX_ENVELOPE]; + static char in_env[ND_CMD_MAX_ENVELOPE]; + const struct nd_cmd_desc *desc = NULL; + unsigned int cmd = _IOC_NR(ioctl_cmd); + void __user *p = (void __user *) arg; + struct device *dev = &nvdimm_bus->dev; + const char *cmd_name, *dimm_name; + unsigned long dsm_mask; + void *buf; + int rc, i; + + if (nvdimm) { + desc = nd_cmd_dimm_desc(cmd); + cmd_name = nvdimm_cmd_name(cmd); + dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0; + dimm_name = dev_name(&nvdimm->dev); + } else { + desc = nd_cmd_bus_desc(cmd); + cmd_name = nvdimm_bus_cmd_name(cmd); + dsm_mask = nd_desc->dsm_mask; + dimm_name = "bus"; + } + + if (!desc || (desc->out_num + desc->in_num == 0) || + !test_bit(cmd, &dsm_mask)) + return -ENOTTY; + + /* fail write commands (when read-only) */ + if (read_only) + switch (ioctl_cmd) { + case ND_IOCTL_VENDOR: + case ND_IOCTL_SET_CONFIG_DATA: + case ND_IOCTL_ARS_START: + dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n", + nvdimm ? nvdimm_cmd_name(cmd) + : nvdimm_bus_cmd_name(cmd)); + return -EPERM; + default: + break; + } + + /* process an input envelope */ + for (i = 0; i < desc->in_num; i++) { + u32 in_size, copy; + + in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env); + if (in_size == UINT_MAX) { + dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -ENXIO; + } + if (!access_ok(VERIFY_READ, p + in_len, in_size)) + return -EFAULT; + if (in_len < sizeof(in_env)) + copy = min_t(u32, sizeof(in_env) - in_len, in_size); + else + copy = 0; + if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) + return -EFAULT; + in_len += in_size; + } + + /* process an output envelope */ + for (i = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, + (u32 *) in_env, (u32 *) out_env); + u32 copy; + + if (out_size == UINT_MAX) { + dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -EFAULT; + } + if (!access_ok(VERIFY_WRITE, p + in_len + out_len, out_size)) + return -EFAULT; + if (out_len < sizeof(out_env)) + copy = min_t(u32, sizeof(out_env) - out_len, out_size); + else + copy = 0; + if (copy && copy_from_user(&out_env[out_len], + p + in_len + out_len, copy)) + return -EFAULT; + out_len += out_size; + } + + buf_len = out_len + in_len; + if (!access_ok(VERIFY_WRITE, p, sizeof(buf_len))) + return -EFAULT; + + if (buf_len > ND_IOCTL_MAX_BUFLEN) { + dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__, + dimm_name, cmd_name, buf_len, + ND_IOCTL_MAX_BUFLEN); + return -EINVAL; + } + + buf = vmalloc(buf_len); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, p, buf_len)) { + rc = -EFAULT; + goto out; + } + + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len); + if (rc < 0) + goto out; + if (copy_to_user(p, buf, buf_len)) + rc = -EFAULT; + out: + vfree(buf); + return rc; +} + static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - return -ENXIO; + long id = (long) file->private_data; + int rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + if (nvdimm_bus->id == id) { + rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg); + break; + } + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int match_dimm(struct device *dev, void *data) +{ + long id = (long) data; + + if (is_nvdimm(dev)) { + struct nvdimm *nvdimm = to_nvdimm(dev); + + return nvdimm->id == id; + } + + return 0; +} + +static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + struct device *dev = device_find_child(&nvdimm_bus->dev, + file->private_data, match_dimm); + struct nvdimm *nvdimm; + + if (!dev) + continue; + + nvdimm = to_nvdimm(dev); + rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg); + put_device(dev); + break; + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int nd_open(struct inode *inode, struct file *file) +{ + long minor = iminor(inode); + + file->private_data = (void *) minor; + return 0; } static const struct file_operations nvdimm_bus_fops = { .owner = THIS_MODULE, - .open = nonseekable_open, + .open = nd_open, .unlocked_ioctl = nd_ioctl, .compat_ioctl = nd_ioctl, .llseek = noop_llseek, }; +static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = nvdimm_ioctl, + .compat_ioctl = nvdimm_ioctl, + .llseek = noop_llseek, +}; + int __init nvdimm_bus_init(void) { int rc; @@ -70,9 +380,14 @@ int __init nvdimm_bus_init(void) rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); if (rc < 0) - goto err_chrdev; + goto err_bus_chrdev; nvdimm_bus_major = rc; + rc = register_chrdev(0, "dimmctl", &nvdimm_fops); + if (rc < 0) + goto err_dimm_chrdev; + nvdimm_major = rc; + nd_class = class_create(THIS_MODULE, "nd"); if (IS_ERR(nd_class)) goto err_class; @@ -80,8 +395,10 @@ int __init nvdimm_bus_init(void) return 0; err_class: + unregister_chrdev(nvdimm_major, "dimmctl"); + err_dimm_chrdev: unregister_chrdev(nvdimm_bus_major, "ndctl"); - err_chrdev: + err_bus_chrdev: bus_unregister(&nvdimm_bus_type); return rc; @@ -91,5 +408,6 @@ void __exit nvdimm_bus_exit(void) { class_destroy(nd_class); unregister_chrdev(nvdimm_bus_major, "ndctl"); + unregister_chrdev(nvdimm_major, "dimmctl"); bus_unregister(&nvdimm_bus_type); } diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index ef957eb37c90..1ce159095c52 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "nd-core.h" @@ -61,6 +62,20 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) return NULL; } +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cmd, len = 0; + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) { struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; @@ -84,6 +99,7 @@ static ssize_t provider_show(struct device *dev, static DEVICE_ATTR_RO(provider); static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_commands.attr, &dev_attr_provider.attr, NULL, }; diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 51ea52cc2079..c3dd7227d1bb 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -12,6 +12,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -33,7 +34,7 @@ static struct device_type nvdimm_device_type = { .release = nvdimm_release, }; -static bool is_nvdimm(struct device *dev) +bool is_nvdimm(struct device *dev) { return dev->type == &nvdimm_device_type; } @@ -55,12 +56,41 @@ EXPORT_SYMBOL_GPL(nvdimm_name); void *nvdimm_provider_data(struct nvdimm *nvdimm) { - return nvdimm->provider_data; + if (nvdimm) + return nvdimm->provider_data; + return NULL; } EXPORT_SYMBOL_GPL(nvdimm_provider_data); +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int cmd, len = 0; + + if (!nvdimm->dsm_mask) + return sprintf(buf, "\n"); + + for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static struct attribute *nvdimm_attributes[] = { + &dev_attr_commands.attr, + NULL, +}; + +struct attribute_group nvdimm_attribute_group = { + .attrs = nvdimm_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_attribute_group); + struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, - const struct attribute_group **groups, unsigned long flags) + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask) { struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); struct device *dev; @@ -75,12 +105,14 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, } nvdimm->provider_data = provider_data; nvdimm->flags = flags; + nvdimm->dsm_mask = dsm_mask; dev = &nvdimm->dev; dev_set_name(dev, "nmem%d", nvdimm->id); dev->parent = &nvdimm_bus->dev; dev->type = &nvdimm_device_type; dev->bus = &nvdimm_bus_type; + dev->devt = MKDEV(nvdimm_major, nvdimm->id); dev->groups = groups; if (device_register(dev) != 0) { put_device(dev); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 9b8303413b60..59528b3c9de8 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -18,6 +18,7 @@ extern struct list_head nvdimm_bus_list; extern struct mutex nvdimm_bus_list_mutex; extern struct bus_type nvdimm_bus_type; +extern int nvdimm_major; struct nvdimm_bus { struct nvdimm_bus_descriptor *nd_desc; @@ -29,10 +30,12 @@ struct nvdimm_bus { struct nvdimm { unsigned long flags; void *provider_data; + unsigned long *dsm_mask; struct device dev; int id; }; +bool is_nvdimm(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void __exit nvdimm_bus_exit(void); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 07787f0dd7de..a39235819af3 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,13 +14,22 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include +#include enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + + /* need to set a limit somewhere, but yes, this is likely overkill */ + ND_IOCTL_MAX_BUFLEN = SZ_4M, + ND_CMD_MAX_ELEM = 4, + ND_CMD_MAX_ENVELOPE = 16, + ND_CMD_ARS_STATUS_MAX = SZ_4K, }; extern struct attribute_group nvdimm_bus_attribute_group; +extern struct attribute_group nvdimm_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -35,6 +44,14 @@ struct nvdimm_bus_descriptor { ndctl_fn ndctl; }; +struct nd_cmd_desc { + int in_num; + int out_num; + u32 in_sizes[ND_CMD_MAX_ELEM]; + int out_sizes[ND_CMD_MAX_ELEM]; +}; + +struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); @@ -45,5 +62,13 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, - const struct attribute_group **groups, unsigned long flags); + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask); +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf); +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 1a0006a76b00..200cc5ea2998 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -271,6 +271,7 @@ header-y += ncp_fs.h header-y += ncp.h header-y += ncp_mount.h header-y += ncp_no.h +header-y += ndctl.h header-y += neighbour.h header-y += netconf.h header-y += netdevice.h diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h new file mode 100644 index 000000000000..ff13c23b26df --- /dev/null +++ b/include/uapi/linux/ndctl.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU Lesser General Public License, + * version 2.1, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + */ +#ifndef __NDCTL_H__ +#define __NDCTL_H__ + +#include + +struct nd_cmd_smart { + __u32 status; + __u8 data[128]; +} __packed; + +struct nd_cmd_smart_threshold { + __u32 status; + __u8 data[8]; +} __packed; + +struct nd_cmd_dimm_flags { + __u32 status; + __u32 flags; +} __packed; + +struct nd_cmd_get_config_size { + __u32 status; + __u32 config_size; + __u32 max_xfer; +} __packed; + +struct nd_cmd_get_config_data_hdr { + __u32 in_offset; + __u32 in_length; + __u32 status; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_set_config_hdr { + __u32 in_offset; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_hdr { + __u32 opcode; + __u32 in_length; + __u8 in_buf[0]; +} __packed; + +struct nd_cmd_vendor_tail { + __u32 status; + __u32 out_length; + __u8 out_buf[0]; +} __packed; + +struct nd_cmd_ars_cap { + __u64 address; + __u64 length; + __u32 status; + __u32 max_ars_out; +} __packed; + +struct nd_cmd_ars_start { + __u64 address; + __u64 length; + __u16 type; + __u8 reserved[6]; + __u32 status; +} __packed; + +struct nd_cmd_ars_status { + __u32 status; + __u32 out_length; + __u64 address; + __u64 length; + __u16 type; + __u32 num_records; + struct nd_ars_record { + __u32 handle; + __u32 flags; + __u64 err_address; + __u64 mask; + } __packed records[0]; +} __packed; + +enum { + ND_CMD_IMPLEMENTED = 0, + + /* bus commands */ + ND_CMD_ARS_CAP = 1, + ND_CMD_ARS_START = 2, + ND_CMD_ARS_STATUS = 3, + + /* per-dimm commands */ + ND_CMD_SMART = 1, + ND_CMD_SMART_THRESHOLD = 2, + ND_CMD_DIMM_FLAGS = 3, + ND_CMD_GET_CONFIG_SIZE = 4, + ND_CMD_GET_CONFIG_DATA = 5, + ND_CMD_SET_CONFIG_DATA = 6, + ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7, + ND_CMD_VENDOR_EFFECT_LOG = 8, + ND_CMD_VENDOR = 9, +}; + +static inline const char *nvdimm_bus_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_ARS_CAP] = "ars_cap", + [ND_CMD_ARS_START] = "ars_start", + [ND_CMD_ARS_STATUS] = "ars_status", + }; + + if (cmd < ARRAY_SIZE(names) && names[cmd]) + return names[cmd]; + return "unknown"; +} + +static inline const char *nvdimm_cmd_name(unsigned cmd) +{ + static const char * const names[] = { + [ND_CMD_SMART] = "smart", + [ND_CMD_SMART_THRESHOLD] = "smart_thresh", + [ND_CMD_DIMM_FLAGS] = "flags", + [ND_CMD_GET_CONFIG_SIZE] = "get_size", + [ND_CMD_GET_CONFIG_DATA] = "get_data", + [ND_CMD_SET_CONFIG_DATA] = "set_data", + [ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size", + [ND_CMD_VENDOR_EFFECT_LOG] = "effect_log", + [ND_CMD_VENDOR] = "vendor", + }; + + if (cmd < ARRAY_SIZE(names) && names[cmd]) + return names[cmd]; + return "unknown"; +} + +#define ND_IOCTL 'N' + +#define ND_IOCTL_SMART _IOWR(ND_IOCTL, ND_CMD_SMART,\ + struct nd_cmd_smart) + +#define ND_IOCTL_SMART_THRESHOLD _IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\ + struct nd_cmd_smart_threshold) + +#define ND_IOCTL_DIMM_FLAGS _IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\ + struct nd_cmd_dimm_flags) + +#define ND_IOCTL_GET_CONFIG_SIZE _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\ + struct nd_cmd_get_config_size) + +#define ND_IOCTL_GET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\ + struct nd_cmd_get_config_data_hdr) + +#define ND_IOCTL_SET_CONFIG_DATA _IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\ + struct nd_cmd_set_config_hdr) + +#define ND_IOCTL_VENDOR _IOWR(ND_IOCTL, ND_CMD_VENDOR,\ + struct nd_cmd_vendor_hdr) + +#define ND_IOCTL_ARS_CAP _IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\ + struct nd_cmd_ars_cap) + +#define ND_IOCTL_ARS_START _IOWR(ND_IOCTL, ND_CMD_ARS_START,\ + struct nd_cmd_ars_start) + +#define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ + struct nd_cmd_ars_status) + +#endif /* __NDCTL_H__ */ -- cgit v1.2.3 From 4d88a97aa9e8cfa6460aab119c5da60ad2267423 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 14:41:48 -0400 Subject: libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver infrastructure * Implement the device-model infrastructure for loading modules and attaching drivers to nvdimm devices. This is a simple association of a nd-device-type number with a driver that has a bitmask of supported device types. To facilitate userspace bind/unbind operations 'modalias' and 'devtype', that also appear in the uevent, are added as generic sysfs attributes for all nvdimm devices. The reason for the device-type number is to support sub-types within a given parent devtype, be it a vendor-specific sub-type or otherwise. * The first consumer of this infrastructure is the driver for dimm devices. It simply uses control messages to retrieve and store the configuration-data image (label set) from each dimm. Note: nd_device_register() arranges for asynchronous registration of nvdimm bus devices by default. Cc: Greg KH Cc: Neil Brown Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 13 +++- drivers/nvdimm/Makefile | 1 + drivers/nvdimm/bus.c | 168 ++++++++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/core.c | 43 +++++++++++- drivers/nvdimm/dimm.c | 92 +++++++++++++++++++++++++ drivers/nvdimm/dimm_devs.c | 136 ++++++++++++++++++++++++++++++++++-- drivers/nvdimm/nd-core.h | 6 +- drivers/nvdimm/nd.h | 36 ++++++++++ include/linux/libnvdimm.h | 2 + include/linux/nd.h | 39 +++++++++++ include/uapi/linux/ndctl.h | 6 ++ 11 files changed, 527 insertions(+), 15 deletions(-) create mode 100644 drivers/nvdimm/dimm.c create mode 100644 drivers/nvdimm/nd.h create mode 100644 include/linux/nd.h (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 9112a6210a4b..c4ccec1bc60b 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -18,6 +18,10 @@ #include #include "nfit.h" +static bool force_enable_dimms; +module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); + static u8 nfit_uuid[NFIT_UUID_MAX][16]; static const u8 *to_nfit_uuid(enum nfit_uuids id) @@ -633,6 +637,7 @@ static struct attribute_group acpi_nfit_dimm_attribute_group = { static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = { &nvdimm_attribute_group, + &nd_device_attribute_group, &acpi_nfit_dimm_attribute_group, NULL, }; @@ -669,7 +674,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, if (!adev_dimm) { dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n", device_handle); - return -ENODEV; + return force_enable_dimms ? 0 : -ENODEV; } status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta); @@ -690,12 +695,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i)) set_bit(i, &nfit_mem->dsm_mask); - return rc; + return force_enable_dimms ? 0 : rc; } static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) { struct nfit_mem *nfit_mem; + int dimm_count = 0; list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) { struct nvdimm *nvdimm; @@ -729,9 +735,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) return -ENOMEM; nfit_mem->nvdimm = nvdimm; + dimm_count++; } - return 0; + return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); } static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 5b68738ba406..d44b5c1fcd3b 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o +libnvdimm-y += dimm.o diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 15f3a3ddc225..a0308f1872bf 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -16,19 +16,183 @@ #include #include #include +#include #include #include #include #include +#include #include "nd-core.h" +#include "nd.h" int nvdimm_major; static int nvdimm_bus_major; static struct class *nd_class; -struct bus_type nvdimm_bus_type = { +static int to_nd_device_type(struct device *dev) +{ + if (is_nvdimm(dev)) + return ND_DEVICE_DIMM; + + return 0; +} + +static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, + to_nd_device_type(dev)); +} + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(drv); + + return test_bit(to_nd_device_type(dev), &nd_drv->type); +} + +static int nvdimm_bus_probe(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + rc = nd_drv->probe(dev); + dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + return rc; +} + +static int nvdimm_bus_remove(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + rc = nd_drv->remove(dev); + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + return rc; +} + +static struct bus_type nvdimm_bus_type = { .name = "nd", + .uevent = nvdimm_bus_uevent, + .match = nvdimm_bus_match, + .probe = nvdimm_bus_probe, + .remove = nvdimm_bus_remove, +}; + +static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain); + +void nd_synchronize(void) +{ + async_synchronize_full_domain(&nd_async_domain); +} +EXPORT_SYMBOL_GPL(nd_synchronize); + +static void nd_async_device_register(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + if (device_add(dev) != 0) { + dev_err(dev, "%s: failed\n", __func__); + put_device(dev); + } + put_device(dev); +} + +static void nd_async_device_unregister(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + device_unregister(dev); + put_device(dev); +} + +void nd_device_register(struct device *dev) +{ + dev->bus = &nvdimm_bus_type; + device_initialize(dev); + get_device(dev); + async_schedule_domain(nd_async_device_register, dev, + &nd_async_domain); +} +EXPORT_SYMBOL(nd_device_register); + +void nd_device_unregister(struct device *dev, enum nd_async_mode mode) +{ + switch (mode) { + case ND_ASYNC: + get_device(dev); + async_schedule_domain(nd_async_device_unregister, dev, + &nd_async_domain); + break; + case ND_SYNC: + nd_synchronize(); + device_unregister(dev); + break; + } +} +EXPORT_SYMBOL(nd_device_unregister); + +/** + * __nd_driver_register() - register a region or a namespace driver + * @nd_drv: driver to register + * @owner: automatically set by nd_driver_register() macro + * @mod_name: automatically set by nd_driver_register() macro + */ +int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, + const char *mod_name) +{ + struct device_driver *drv = &nd_drv->drv; + + if (!nd_drv->type) { + pr_debug("driver type bitmask not set (%pf)\n", + __builtin_return_address(0)); + return -EINVAL; + } + + if (!nd_drv->probe || !nd_drv->remove) { + pr_debug("->probe() and ->remove() must be specified\n"); + return -EINVAL; + } + + drv->bus = &nvdimm_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL(__nd_driver_register); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n", + to_nd_device_type(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static struct attribute *nd_device_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_devtype.attr, + NULL, +}; + +/** + * nd_device_attribute_group - generic attributes for all devices on an nd bus + */ +struct attribute_group nd_device_attribute_group = { + .attrs = nd_device_attributes, }; +EXPORT_SYMBOL_GPL(nd_device_attribute_group); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) { @@ -404,7 +568,7 @@ int __init nvdimm_bus_init(void) return rc; } -void __exit nvdimm_bus_exit(void) +void nvdimm_bus_exit(void) { class_destroy(nd_class); unregister_chrdev(nvdimm_bus_major, "ndctl"); diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 1ce159095c52..50ab880f0dc0 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -18,6 +18,7 @@ #include #include #include "nd-core.h" +#include "nd.h" LIST_HEAD(nvdimm_bus_list); DEFINE_MUTEX(nvdimm_bus_list_mutex); @@ -98,8 +99,33 @@ static ssize_t provider_show(struct device *dev, } static DEVICE_ATTR_RO(provider); +static int flush_namespaces(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + return 0; +} + +static int flush_regions_dimms(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + device_for_each_child(dev, NULL, flush_namespaces); + return 0; +} + +static ssize_t wait_probe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + nd_synchronize(); + device_for_each_child(dev, NULL, flush_regions_dimms); + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR_RO(wait_probe); + static struct attribute *nvdimm_bus_attributes[] = { &dev_attr_commands.attr, + &dev_attr_wait_probe.attr, &dev_attr_provider.attr, NULL, }; @@ -161,7 +187,7 @@ static int child_unregister(struct device *dev, void *data) if (dev->class) /* pass */; else - device_unregister(dev); + nd_device_unregister(dev, ND_SYNC); return 0; } @@ -174,6 +200,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) list_del_init(&nvdimm_bus->list); mutex_unlock(&nvdimm_bus_list_mutex); + nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); nvdimm_bus_destroy_ndctl(nvdimm_bus); @@ -183,12 +210,24 @@ EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); static __init int libnvdimm_init(void) { - return nvdimm_bus_init(); + int rc; + + rc = nvdimm_bus_init(); + if (rc) + return rc; + rc = nvdimm_init(); + if (rc) + goto err_dimm; + return 0; + err_dimm: + nvdimm_bus_exit(); + return rc; } static __exit void libnvdimm_exit(void) { WARN_ON(!list_empty(&nvdimm_bus_list)); + nvdimm_exit(); nvdimm_bus_exit(); } diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c new file mode 100644 index 000000000000..28001a6ccd4e --- /dev/null +++ b/drivers/nvdimm/dimm.c @@ -0,0 +1,92 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "nd.h" + +static void free_data(struct nvdimm_drvdata *ndd) +{ + if (!ndd) + return; + + if (ndd->data && is_vmalloc_addr(ndd->data)) + vfree(ndd->data); + else + kfree(ndd->data); + kfree(ndd); +} + +static int nvdimm_probe(struct device *dev) +{ + struct nvdimm_drvdata *ndd; + int rc; + + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); + if (!ndd) + return -ENOMEM; + + dev_set_drvdata(dev, ndd); + ndd->dev = dev; + + rc = nvdimm_init_nsarea(ndd); + if (rc) + goto err; + + rc = nvdimm_init_config_data(ndd); + if (rc) + goto err; + + dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size); + + return 0; + + err: + free_data(ndd); + return rc; +} + +static int nvdimm_remove(struct device *dev) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + + free_data(ndd); + + return 0; +} + +static struct nd_device_driver nvdimm_driver = { + .probe = nvdimm_probe, + .remove = nvdimm_remove, + .drv = { + .name = "nvdimm", + }, + .type = ND_DRIVER_DIMM, +}; + +int __init nvdimm_init(void) +{ + return nd_driver_register(&nvdimm_driver); +} + +void __exit nvdimm_exit(void) +{ + driver_unregister(&nvdimm_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index c3dd7227d1bb..b3ae86f2e1da 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -18,9 +19,115 @@ #include #include #include "nd-core.h" +#include "nd.h" static DEFINE_IDA(dimm_ida); +/* + * Retrieve bus and dimm handle and return if this bus supports + * get_config_data commands + */ +static int __validate_dimm(struct nvdimm_drvdata *ndd) +{ + struct nvdimm *nvdimm; + + if (!ndd) + return -EINVAL; + + nvdimm = to_nvdimm(ndd->dev); + + if (!nvdimm->dsm_mask) + return -ENXIO; + if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask)) + return -ENXIO; + + return 0; +} + +static int validate_dimm(struct nvdimm_drvdata *ndd) +{ + int rc = __validate_dimm(ndd); + + if (rc && ndd) + dev_dbg(ndd->dev, "%pf: %s error: %d\n", + __builtin_return_address(0), __func__, rc); + return rc; +} + +/** + * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area + * @nvdimm: dimm to initialize + */ +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) +{ + struct nd_cmd_get_config_size *cmd = &ndd->nsarea; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + + if (rc) + return rc; + + if (cmd->config_size) + return 0; /* already valid */ + + memset(cmd, 0, sizeof(*cmd)); + nd_desc = nvdimm_bus->nd_desc; + return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd)); +} + +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nd_cmd_get_config_data_hdr *cmd; + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + u32 max_cmd_size, config_size; + size_t offset; + + if (rc) + return rc; + + if (ndd->data) + return 0; + + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) + return -ENXIO; + + ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL); + if (!ndd->data) + ndd->data = vmalloc(ndd->nsarea.config_size); + + if (!ndd->data) + return -ENOMEM; + + max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + nd_desc = nvdimm_bus->nd_desc; + for (config_size = ndd->nsarea.config_size, offset = 0; + config_size; config_size -= cmd->in_length, + offset += cmd->in_length) { + cmd->in_length = min(config_size, max_cmd_size); + cmd->in_offset = offset; + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_DATA, cmd, + cmd->in_length + sizeof(*cmd)); + if (rc || cmd->status) { + rc = -ENXIO; + break; + } + memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length); + } + dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc); + kfree(cmd); + + return rc; +} + static void nvdimm_release(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); @@ -111,14 +218,33 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, dev_set_name(dev, "nmem%d", nvdimm->id); dev->parent = &nvdimm_bus->dev; dev->type = &nvdimm_device_type; - dev->bus = &nvdimm_bus_type; dev->devt = MKDEV(nvdimm_major, nvdimm->id); dev->groups = groups; - if (device_register(dev) != 0) { - put_device(dev); - return NULL; - } + nd_device_register(dev); return nvdimm; } EXPORT_SYMBOL_GPL(nvdimm_create); + +static int count_dimms(struct device *dev, void *c) +{ + int *count = c; + + if (is_nvdimm(dev)) + (*count)++; + return 0; +} + +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) +{ + int count = 0; + /* Flush any possible dimm registration failures */ + nd_synchronize(); + + device_for_each_child(&nvdimm_bus->dev, &count, count_dimms); + dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count); + if (count != dimm_count) + return -ENXIO; + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 59528b3c9de8..f2004b790874 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -17,7 +17,6 @@ extern struct list_head nvdimm_bus_list; extern struct mutex nvdimm_bus_list_mutex; -extern struct bus_type nvdimm_bus_type; extern int nvdimm_major; struct nvdimm_bus { @@ -35,10 +34,11 @@ struct nvdimm { int id; }; -bool is_nvdimm(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); -void __exit nvdimm_bus_exit(void); +void nvdimm_bus_exit(void); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); +void nd_synchronize(void); +bool is_nvdimm(struct device *dev); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h new file mode 100644 index 000000000000..1f7f6ecab0fc --- /dev/null +++ b/drivers/nvdimm/nd.h @@ -0,0 +1,36 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ND_H__ +#define __ND_H__ +#include +#include +#include + +struct nvdimm_drvdata { + struct device *dev; + struct nd_cmd_get_config_size nsarea; + void *data; +}; + +enum nd_async_mode { + ND_SYNC, + ND_ASYNC, +}; + +void nd_device_register(struct device *dev); +void nd_device_unregister(struct device *dev, enum nd_async_mode mode); +int __init nvdimm_init(void); +void nvdimm_exit(void); +int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +#endif /* __ND_H__ */ diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a39235819af3..d3ebccf4ea8b 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -30,6 +30,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; +extern struct attribute_group nd_device_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -71,4 +72,5 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/nd.h b/include/linux/nd.h new file mode 100644 index 000000000000..e074f67e53a3 --- /dev/null +++ b/include/linux/nd.h @@ -0,0 +1,39 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __LINUX_ND_H__ +#define __LINUX_ND_H__ +#include +#include + +struct nd_device_driver { + struct device_driver drv; + unsigned long type; + int (*probe)(struct device *dev); + int (*remove)(struct device *dev); +}; + +static inline struct nd_device_driver *to_nd_device_driver( + struct device_driver *drv) +{ + return container_of(drv, struct nd_device_driver, drv); +} + +#define MODULE_ALIAS_ND_DEVICE(type) \ + MODULE_ALIAS("nd:t" __stringify(type) "*") +#define ND_DEVICE_MODALIAS_FMT "nd:t%d" + +int __must_check __nd_driver_register(struct nd_device_driver *nd_drv, + struct module *module, const char *mod_name); +#define nd_driver_register(driver) \ + __nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) +#endif /* __LINUX_ND_H__ */ diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index ff13c23b26df..37640916d146 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -175,4 +175,10 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_IOCTL_ARS_STATUS _IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\ struct nd_cmd_ars_status) + +#define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ + +enum nd_driver_flags { + ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM, +}; #endif /* __NDCTL_H__ */ -- cgit v1.2.3 From 1f7df6f88b9245a7f2d0f8ecbc97dc88c8d0d8e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 9 Jun 2015 20:13:14 -0400 Subject: libnvdimm, nfit: regions (block-data-window, persistent memory, volatile memory) A "region" device represents the maximum capacity of a BLK range (mmio block-data-window(s)), or a PMEM range (DAX-capable persistent memory or volatile memory), without regard for aliasing. Aliasing, in the dimm-local address space (DPA), is resolved by metadata on a dimm to designate which exclusive interface will access the aliased DPA ranges. Support for the per-dimm metadata/label arrvies is in a subsequent patch. The name format of "region" devices is "regionN" where, like dimms, N is a global ida index assigned at discovery time. This id is not reliable across reboots nor in the presence of hotplug. Look to attributes of the region or static id-data of the sub-namespace to generate a persistent name. However, if the platform configuration does not change it is reasonable to expect the same region id to be assigned at the next boot. "region"s have 2 generic attributes "size", and "mapping"s where: - size: the BLK accessible capacity or the span of the system physical address range in the case of PMEM. - mappingN: a tuple describing a dimm's contribution to the region's capacity in the format (,,). For a PMEM-region there will be at least one mapping per dimm in the interleave set. For a BLK-region there is only "mapping0" listing the starting DPA of the BLK-region and the available DPA capacity of that space (matches "size" above). The max number of mappings per "region" is hard coded per the constraints of sysfs attribute groups. That said the number of mappings per region should never exceed the maximum number of possible dimms in the system. If the current number turns out to not be enough then the "mappings" attribute clarifies how many there are supposed to be. "32 should be enough for anybody...". Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 148 ++++++++++++++++++++- drivers/nvdimm/Makefile | 1 + drivers/nvdimm/nd-core.h | 3 + drivers/nvdimm/nd.h | 11 ++ drivers/nvdimm/region_devs.c | 297 +++++++++++++++++++++++++++++++++++++++++++ include/linux/libnvdimm.h | 25 ++++ 6 files changed, 484 insertions(+), 1 deletion(-) create mode 100644 drivers/nvdimm/region_devs.c (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index c4ccec1bc60b..068f69d70c9e 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -757,11 +757,153 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc) set_bit(i, &nd_desc->dsm_mask); } +static ssize_t range_index_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); + + return sprintf(buf, "%d\n", nfit_spa->spa->range_index); +} +static DEVICE_ATTR_RO(range_index); + +static struct attribute *acpi_nfit_region_attributes[] = { + &dev_attr_range_index.attr, + NULL, +}; + +static struct attribute_group acpi_nfit_region_attribute_group = { + .name = "nfit", + .attrs = acpi_nfit_region_attributes, +}; + +static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_mapping_attribute_group, + &acpi_nfit_region_attribute_group, + NULL, +}; + +static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, + struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, + struct acpi_nfit_memory_map *memdev, + struct acpi_nfit_system_address *spa) +{ + struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, + memdev->device_handle); + struct nfit_mem *nfit_mem; + int blk_valid = 0; + + if (!nvdimm) { + dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n", + spa->range_index, memdev->device_handle); + return -ENODEV; + } + + nd_mapping->nvdimm = nvdimm; + switch (nfit_spa_type(spa)) { + case NFIT_SPA_PM: + case NFIT_SPA_VOLATILE: + nd_mapping->start = memdev->address; + nd_mapping->size = memdev->region_size; + break; + case NFIT_SPA_DCR: + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem || !nfit_mem->bdw) { + dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n", + spa->range_index, nvdimm_name(nvdimm)); + } else { + nd_mapping->size = nfit_mem->bdw->capacity; + nd_mapping->start = nfit_mem->bdw->start_address; + blk_valid = 1; + } + + ndr_desc->nd_mapping = nd_mapping; + ndr_desc->num_mappings = blk_valid; + if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc)) + return -ENOMEM; + break; + } + + return 0; +} + +static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, + struct nfit_spa *nfit_spa) +{ + static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; + struct acpi_nfit_system_address *spa = nfit_spa->spa; + struct nfit_memdev *nfit_memdev; + struct nd_region_desc ndr_desc; + struct nvdimm_bus *nvdimm_bus; + struct resource res; + int count = 0; + + if (spa->range_index == 0) { + dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n", + __func__); + return 0; + } + + memset(&res, 0, sizeof(res)); + memset(&nd_mappings, 0, sizeof(nd_mappings)); + memset(&ndr_desc, 0, sizeof(ndr_desc)); + res.start = spa->address; + res.end = res.start + spa->length - 1; + ndr_desc.res = &res; + ndr_desc.provider_data = nfit_spa; + ndr_desc.attr_groups = acpi_nfit_region_attribute_groups; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; + struct nd_mapping *nd_mapping; + int rc; + + if (memdev->range_index != spa->range_index) + continue; + if (count >= ND_MAX_MAPPINGS) { + dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n", + spa->range_index, ND_MAX_MAPPINGS); + return -ENXIO; + } + nd_mapping = &nd_mappings[count++]; + rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, &ndr_desc, + memdev, spa); + if (rc) + return rc; + } + + ndr_desc.nd_mapping = nd_mappings; + ndr_desc.num_mappings = count; + nvdimm_bus = acpi_desc->nvdimm_bus; + if (nfit_spa_type(spa) == NFIT_SPA_PM) { + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + return -ENOMEM; + } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { + if (!nvdimm_volatile_region_create(nvdimm_bus, &ndr_desc)) + return -ENOMEM; + } + return 0; +} + +static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) +{ + struct nfit_spa *nfit_spa; + + list_for_each_entry(nfit_spa, &acpi_desc->spas, list) { + int rc = acpi_nfit_register_region(acpi_desc, nfit_spa); + + if (rc) + return rc; + } + return 0; +} + static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) { struct device *dev = acpi_desc->dev; const void *end; u8 *data; + int rc; INIT_LIST_HEAD(&acpi_desc->spas); INIT_LIST_HEAD(&acpi_desc->dcrs); @@ -786,7 +928,11 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) acpi_nfit_init_dsms(acpi_desc); - return acpi_nfit_register_dimms(acpi_desc); + rc = acpi_nfit_register_dimms(acpi_desc); + if (rc) + return rc; + + return acpi_nfit_register_regions(acpi_desc); } static int acpi_nfit_add(struct acpi_device *adev) diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index d44b5c1fcd3b..88afd0d849c3 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -4,3 +4,4 @@ libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o libnvdimm-y += dimm.o +libnvdimm-y += region_devs.o diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index f2004b790874..1d760bf24857 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -40,5 +40,8 @@ void nvdimm_bus_exit(void); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); void nd_synchronize(void); +int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); +int nd_match_dimm(struct device *dev, void *data); bool is_nvdimm(struct device *dev); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 1f7f6ecab0fc..ea0cca337aa6 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -12,6 +12,7 @@ */ #ifndef __ND_H__ #define __ND_H__ +#include #include #include #include @@ -22,6 +23,16 @@ struct nvdimm_drvdata { void *data; }; +struct nd_region { + struct device dev; + u16 ndr_mappings; + u64 ndr_size; + u64 ndr_start; + int id; + void *provider_data; + struct nd_mapping mapping[0]; +}; + enum nd_async_mode { ND_SYNC, ND_ASYNC, diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c new file mode 100644 index 000000000000..4bda2e0df8f7 --- /dev/null +++ b/drivers/nvdimm/region_devs.c @@ -0,0 +1,297 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include "nd-core.h" +#include "nd.h" + +static DEFINE_IDA(region_ida); + +static void nd_region_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + u16 i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + put_device(&nvdimm->dev); + } + ida_simple_remove(®ion_ida, nd_region->id); + kfree(nd_region); +} + +static struct device_type nd_blk_device_type = { + .name = "nd_blk", + .release = nd_region_release, +}; + +static struct device_type nd_pmem_device_type = { + .name = "nd_pmem", + .release = nd_region_release, +}; + +static struct device_type nd_volatile_device_type = { + .name = "nd_volatile", + .release = nd_region_release, +}; + +static bool is_nd_pmem(struct device *dev) +{ + return dev ? dev->type == &nd_pmem_device_type : false; +} + +struct nd_region *to_nd_region(struct device *dev) +{ + struct nd_region *nd_region = container_of(dev, struct nd_region, dev); + + WARN_ON(dev->type->release != nd_region_release); + return nd_region; +} +EXPORT_SYMBOL_GPL(to_nd_region); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long size = 0; + + if (is_nd_pmem(dev)) { + size = nd_region->ndr_size; + } else if (nd_region->ndr_mappings == 1) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + size = nd_mapping->size; + } + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t mappings_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ndr_mappings); +} +static DEVICE_ATTR_RO(mappings); + +static struct attribute *nd_region_attributes[] = { + &dev_attr_size.attr, + &dev_attr_mappings.attr, + NULL, +}; + +struct attribute_group nd_region_attribute_group = { + .attrs = nd_region_attributes, +}; +EXPORT_SYMBOL_GPL(nd_region_attribute_group); + +static ssize_t mappingN(struct device *dev, char *buf, int n) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_mapping *nd_mapping; + struct nvdimm *nvdimm; + + if (n >= nd_region->ndr_mappings) + return -ENXIO; + nd_mapping = &nd_region->mapping[n]; + nvdimm = nd_mapping->nvdimm; + + return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size); +} + +#define REGION_MAPPING(idx) \ +static ssize_t mapping##idx##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return mappingN(dev, buf, idx); \ +} \ +static DEVICE_ATTR_RO(mapping##idx) + +/* + * 32 should be enough for a while, even in the presence of socket + * interleave a 32-way interleave set is a degenerate case. + */ +REGION_MAPPING(0); +REGION_MAPPING(1); +REGION_MAPPING(2); +REGION_MAPPING(3); +REGION_MAPPING(4); +REGION_MAPPING(5); +REGION_MAPPING(6); +REGION_MAPPING(7); +REGION_MAPPING(8); +REGION_MAPPING(9); +REGION_MAPPING(10); +REGION_MAPPING(11); +REGION_MAPPING(12); +REGION_MAPPING(13); +REGION_MAPPING(14); +REGION_MAPPING(15); +REGION_MAPPING(16); +REGION_MAPPING(17); +REGION_MAPPING(18); +REGION_MAPPING(19); +REGION_MAPPING(20); +REGION_MAPPING(21); +REGION_MAPPING(22); +REGION_MAPPING(23); +REGION_MAPPING(24); +REGION_MAPPING(25); +REGION_MAPPING(26); +REGION_MAPPING(27); +REGION_MAPPING(28); +REGION_MAPPING(29); +REGION_MAPPING(30); +REGION_MAPPING(31); + +static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nd_region *nd_region = to_nd_region(dev); + + if (n < nd_region->ndr_mappings) + return a->mode; + return 0; +} + +static struct attribute *mapping_attributes[] = { + &dev_attr_mapping0.attr, + &dev_attr_mapping1.attr, + &dev_attr_mapping2.attr, + &dev_attr_mapping3.attr, + &dev_attr_mapping4.attr, + &dev_attr_mapping5.attr, + &dev_attr_mapping6.attr, + &dev_attr_mapping7.attr, + &dev_attr_mapping8.attr, + &dev_attr_mapping9.attr, + &dev_attr_mapping10.attr, + &dev_attr_mapping11.attr, + &dev_attr_mapping12.attr, + &dev_attr_mapping13.attr, + &dev_attr_mapping14.attr, + &dev_attr_mapping15.attr, + &dev_attr_mapping16.attr, + &dev_attr_mapping17.attr, + &dev_attr_mapping18.attr, + &dev_attr_mapping19.attr, + &dev_attr_mapping20.attr, + &dev_attr_mapping21.attr, + &dev_attr_mapping22.attr, + &dev_attr_mapping23.attr, + &dev_attr_mapping24.attr, + &dev_attr_mapping25.attr, + &dev_attr_mapping26.attr, + &dev_attr_mapping27.attr, + &dev_attr_mapping28.attr, + &dev_attr_mapping29.attr, + &dev_attr_mapping30.attr, + &dev_attr_mapping31.attr, + NULL, +}; + +struct attribute_group nd_mapping_attribute_group = { + .is_visible = mapping_visible, + .attrs = mapping_attributes, +}; +EXPORT_SYMBOL_GPL(nd_mapping_attribute_group); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc, struct device_type *dev_type, + const char *caller) +{ + struct nd_region *nd_region; + struct device *dev; + u16 i; + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", + caller, dev_name(&nvdimm->dev), i); + + return NULL; + } + } + + nd_region = kzalloc(sizeof(struct nd_region) + + sizeof(struct nd_mapping) * ndr_desc->num_mappings, + GFP_KERNEL); + if (!nd_region) + return NULL; + nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); + if (nd_region->id < 0) { + kfree(nd_region); + return NULL; + } + + memcpy(nd_region->mapping, ndr_desc->nd_mapping, + sizeof(struct nd_mapping) * ndr_desc->num_mappings); + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + get_device(&nvdimm->dev); + } + nd_region->ndr_mappings = ndr_desc->num_mappings; + nd_region->provider_data = ndr_desc->provider_data; + dev = &nd_region->dev; + dev_set_name(dev, "region%d", nd_region->id); + dev->parent = &nvdimm_bus->dev; + dev->type = dev_type; + dev->groups = ndr_desc->attr_groups; + nd_region->ndr_size = resource_size(ndr_desc->res); + nd_region->ndr_start = ndr_desc->res->start; + nd_device_register(dev); + + return nd_region; +} + +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); + +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + if (ndr_desc->num_mappings > 1) + return NULL; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); + +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index d3ebccf4ea8b..39e7e606092a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -26,11 +26,14 @@ enum { ND_CMD_MAX_ELEM = 4, ND_CMD_MAX_ENVELOPE = 16, ND_CMD_ARS_STATUS_MAX = SZ_4K, + ND_MAX_MAPPINGS = 32, }; extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_region_attribute_group; +extern struct attribute_group nd_mapping_attribute_group; struct nvdimm; struct nvdimm_bus_descriptor; @@ -38,6 +41,12 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, unsigned int cmd, void *buf, unsigned int buf_len); +struct nd_mapping { + struct nvdimm *nvdimm; + u64 start; + u64 size; +}; + struct nvdimm_bus_descriptor { const struct attribute_group **attr_groups; unsigned long dsm_mask; @@ -52,6 +61,14 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_region_desc { + struct resource *res; + struct nd_mapping *nd_mapping; + u16 num_mappings; + const struct attribute_group **attr_groups; + void *provider_data; +}; + struct nvdimm_bus; struct device; struct nvdimm_bus *nvdimm_bus_register(struct device *parent, @@ -59,9 +76,11 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); +struct nd_region *to_nd_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); +void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -73,4 +92,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, const struct nd_cmd_desc *desc, int idx, const u32 *in_field, const u32 *out_field); int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count); +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 May 2015 15:02:11 -0400 Subject: libnvdimm: support for legacy (non-aliasing) nvdimms The libnvdimm region driver is an intermediary driver that translates non-volatile "region"s into "namespace" sub-devices that are surfaced by persistent memory block-device drivers (PMEM and BLK). ACPI 6 introduces the concept that a given nvdimm may simultaneously offer multiple access modes to its media through direct PMEM load/store access, or windowed BLK mode. Existing nvdimms mostly implement a PMEM interface, some offer a BLK-like mode, but never both as ACPI 6 defines. If an nvdimm is single interfaced, then there is no need for dimm metadata labels. For these devices we can take the region boundaries directly to create a child namespace device (nd_namespace_io). Acked-by: Christoph Hellwig Tested-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/nvdimm/Makefile | 2 + drivers/nvdimm/bus.c | 26 ++++++++++ drivers/nvdimm/core.c | 44 ++++++++++++++-- drivers/nvdimm/dimm.c | 2 +- drivers/nvdimm/namespace_devs.c | 111 ++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/nd-core.h | 6 ++- drivers/nvdimm/nd.h | 13 +++++ drivers/nvdimm/region.c | 93 +++++++++++++++++++++++++++++++++ drivers/nvdimm/region_devs.c | 66 +++++++++++++++++++++++- include/linux/libnvdimm.h | 7 ++- include/linux/nd.h | 10 ++++ include/uapi/linux/ndctl.h | 10 ++++ 13 files changed, 383 insertions(+), 8 deletions(-) create mode 100644 drivers/nvdimm/namespace_devs.c create mode 100644 drivers/nvdimm/region.c (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 068f69d70c9e..ce290748fe36 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -780,6 +780,7 @@ static struct attribute_group acpi_nfit_region_attribute_group = { static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { &nd_region_attribute_group, &nd_mapping_attribute_group, + &nd_device_attribute_group, &acpi_nfit_region_attribute_group, NULL, }; diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 88afd0d849c3..af5e2760ddbd 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -5,3 +5,5 @@ libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o libnvdimm-y += dimm.o libnvdimm-y += region_devs.o +libnvdimm-y += region.o +libnvdimm-y += namespace_devs.o diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index a0308f1872bf..4b77665a6cc8 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include #include +#include #include #include #include @@ -33,6 +34,12 @@ static int to_nd_device_type(struct device *dev) { if (is_nvdimm(dev)) return ND_DEVICE_DIMM; + else if (is_nd_pmem(dev)) + return ND_DEVICE_REGION_PMEM; + else if (is_nd_blk(dev)) + return ND_DEVICE_REGION_BLK; + else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent)) + return nd_region_to_nstype(to_nd_region(dev->parent)); return 0; } @@ -50,27 +57,46 @@ static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) return test_bit(to_nd_device_type(dev), &nd_drv->type); } +static struct module *to_bus_provider(struct device *dev) +{ + /* pin bus providers while regions are enabled */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + return nvdimm_bus->module; + } + return NULL; +} + static int nvdimm_bus_probe(struct device *dev) { struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); int rc; + if (!try_module_get(provider)) + return -ENXIO; + rc = nd_drv->probe(dev); dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, dev_name(dev), rc); + if (rc != 0) + module_put(provider); return rc; } static int nvdimm_bus_remove(struct device *dev) { struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); int rc; rc = nd_drv->remove(dev); dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, dev_name(dev), rc); + module_put(provider); return rc; } diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 50ab880f0dc0..1b6b15d11f54 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -24,6 +24,36 @@ LIST_HEAD(nvdimm_bus_list); DEFINE_MUTEX(nvdimm_bus_list_mutex); static DEFINE_IDA(nd_ida); +void nvdimm_bus_lock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_lock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_lock); + +void nvdimm_bus_unlock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_unlock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_unlock); + +bool is_nvdimm_bus_locked(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + return mutex_is_locked(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(is_nvdimm_bus_locked); + static void nvdimm_bus_release(struct device *dev) { struct nvdimm_bus *nvdimm_bus; @@ -135,8 +165,8 @@ struct attribute_group nvdimm_bus_attribute_group = { }; EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); -struct nvdimm_bus *nvdimm_bus_register(struct device *parent, - struct nvdimm_bus_descriptor *nd_desc) +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nd_desc, struct module *module) { struct nvdimm_bus *nvdimm_bus; int rc; @@ -146,11 +176,13 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + mutex_init(&nvdimm_bus->reconfig_mutex); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; } nvdimm_bus->nd_desc = nd_desc; + nvdimm_bus->module = module; nvdimm_bus->dev.parent = parent; nvdimm_bus->dev.release = nvdimm_bus_release; nvdimm_bus->dev.groups = nd_desc->attr_groups; @@ -174,7 +206,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, put_device(&nvdimm_bus->dev); return NULL; } -EXPORT_SYMBOL_GPL(nvdimm_bus_register); +EXPORT_SYMBOL_GPL(__nvdimm_bus_register); static int child_unregister(struct device *dev, void *data) { @@ -218,7 +250,12 @@ static __init int libnvdimm_init(void) rc = nvdimm_init(); if (rc) goto err_dimm; + rc = nd_region_init(); + if (rc) + goto err_region; return 0; + err_region: + nvdimm_exit(); err_dimm: nvdimm_bus_exit(); return rc; @@ -227,6 +264,7 @@ static __init int libnvdimm_init(void) static __exit void libnvdimm_exit(void) { WARN_ON(!list_empty(&nvdimm_bus_list)); + nd_region_exit(); nvdimm_exit(); nvdimm_bus_exit(); } diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c index 28001a6ccd4e..eb20fc2df32b 100644 --- a/drivers/nvdimm/dimm.c +++ b/drivers/nvdimm/dimm.c @@ -84,7 +84,7 @@ int __init nvdimm_init(void) return nd_driver_register(&nvdimm_driver); } -void __exit nvdimm_exit(void) +void nvdimm_exit(void) { driver_unregister(&nvdimm_driver.drv); } diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c new file mode 100644 index 000000000000..4f653d1e61ad --- /dev/null +++ b/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,111 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include "nd.h" + +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + +static struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, +}; + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + NULL, +}; + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + NULL, +}; + +static struct device **create_namespace_io(struct nd_region *nd_region) +{ + struct nd_namespace_io *nsio; + struct device *dev, **devs; + struct resource *res; + + nsio = kzalloc(sizeof(*nsio), GFP_KERNEL); + if (!nsio) + return NULL; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) { + kfree(nsio); + return NULL; + } + + dev = &nsio->dev; + dev->type = &namespace_io_device_type; + dev->parent = &nd_region->dev; + res = &nsio->res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + res->start = nd_region->ndr_start; + res->end = res->start + nd_region->ndr_size - 1; + + devs[0] = dev; + return devs; +} + +int nd_region_register_namespaces(struct nd_region *nd_region, int *err) +{ + struct device **devs = NULL; + int i; + + *err = 0; + switch (nd_region_to_nstype(nd_region)) { + case ND_DEVICE_NAMESPACE_IO: + devs = create_namespace_io(nd_region); + break; + default: + break; + } + + if (!devs) + return -ENODEV; + + for (i = 0; devs[i]; i++) { + struct device *dev = devs[i]; + + dev_set_name(dev, "namespace%d.%d", nd_region->id, i); + dev->groups = nd_namespace_attribute_groups; + nd_device_register(dev); + } + kfree(devs); + + return i; +} diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 1d760bf24857..0e9b41fd2546 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -21,9 +21,11 @@ extern int nvdimm_major; struct nvdimm_bus { struct nvdimm_bus_descriptor *nd_desc; + struct module *module; struct list_head list; struct device dev; int id; + struct mutex reconfig_mutex; }; struct nvdimm { @@ -34,6 +36,9 @@ struct nvdimm { int id; }; +bool is_nvdimm(struct device *dev); +bool is_nd_blk(struct device *dev); +bool is_nd_pmem(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); @@ -43,5 +48,4 @@ void nd_synchronize(void); int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); int nd_match_dimm(struct device *dev, void *data); -bool is_nvdimm(struct device *dev); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ea0cca337aa6..bc5a08e36a25 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -23,6 +23,11 @@ struct nvdimm_drvdata { void *data; }; +struct nd_region_namespaces { + int count; + int active; +}; + struct nd_region { struct device dev; u16 ndr_mappings; @@ -41,7 +46,15 @@ enum nd_async_mode { void nd_device_register(struct device *dev); void nd_device_unregister(struct device *dev, enum nd_async_mode mode); int __init nvdimm_init(void); +int __init nd_region_init(void); void nvdimm_exit(void); +void nd_region_exit(void); int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd); int nvdimm_init_config_data(struct nvdimm_drvdata *ndd); +struct nd_region *to_nd_region(struct device *dev); +int nd_region_to_nstype(struct nd_region *nd_region); +int nd_region_register_namespaces(struct nd_region *nd_region, int *err); +void nvdimm_bus_lock(struct device *dev); +void nvdimm_bus_unlock(struct device *dev); +bool is_nvdimm_bus_locked(struct device *dev); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c new file mode 100644 index 000000000000..ade3dba81afd --- /dev/null +++ b/drivers/nvdimm/region.c @@ -0,0 +1,93 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include "nd.h" + +static int nd_region_probe(struct device *dev) +{ + int err; + struct nd_region_namespaces *num_ns; + struct nd_region *nd_region = to_nd_region(dev); + int rc = nd_region_register_namespaces(nd_region, &err); + + num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); + if (!num_ns) + return -ENOMEM; + + if (rc < 0) + return rc; + + num_ns->active = rc; + num_ns->count = rc + err; + dev_set_drvdata(dev, num_ns); + + if (err == 0) + return 0; + + if (rc == err) + return -ENODEV; + + /* + * Given multiple namespaces per region, we do not want to + * disable all the successfully registered peer namespaces upon + * a single registration failure. If userspace is missing a + * namespace that it expects it can disable/re-enable the region + * to retry discovery after correcting the failure. + * /namespaces returns the current + * "/" namespace count. + */ + dev_err(dev, "failed to register %d namespace%s, continuing...\n", + err, err == 1 ? "" : "s"); + return 0; +} + +static int child_unregister(struct device *dev, void *data) +{ + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +static int nd_region_remove(struct device *dev) +{ + /* flush attribute readers and disable */ + nvdimm_bus_lock(dev); + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + + device_for_each_child(dev, NULL, child_unregister); + return 0; +} + +static struct nd_device_driver nd_region_driver = { + .probe = nd_region_probe, + .remove = nd_region_remove, + .drv = { + .name = "nd_region", + }, + .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM, +}; + +int __init nd_region_init(void) +{ + return nd_driver_register(&nd_region_driver); +} + +void nd_region_exit(void) +{ + driver_unregister(&nd_region_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4bda2e0df8f7..b5c5b9095b28 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -47,11 +47,16 @@ static struct device_type nd_volatile_device_type = { .release = nd_region_release, }; -static bool is_nd_pmem(struct device *dev) +bool is_nd_pmem(struct device *dev) { return dev ? dev->type == &nd_pmem_device_type : false; } +bool is_nd_blk(struct device *dev) +{ + return dev ? dev->type == &nd_blk_device_type : false; +} + struct nd_region *to_nd_region(struct device *dev) { struct nd_region *nd_region = container_of(dev, struct nd_region, dev); @@ -61,6 +66,37 @@ struct nd_region *to_nd_region(struct device *dev) } EXPORT_SYMBOL_GPL(to_nd_region); +/** + * nd_region_to_nstype() - region to an integer namespace type + * @nd_region: region-device to interrogate + * + * This is the 'nstype' attribute of a region as well, an input to the + * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match + * namespace devices with namespace drivers. + */ +int nd_region_to_nstype(struct nd_region *nd_region) +{ + if (is_nd_pmem(&nd_region->dev)) { + u16 i, alias; + + for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (nvdimm->flags & NDD_ALIASING) + alias++; + } + if (alias) + return ND_DEVICE_NAMESPACE_PMEM; + else + return ND_DEVICE_NAMESPACE_IO; + } else if (is_nd_blk(&nd_region->dev)) { + return ND_DEVICE_NAMESPACE_BLK; + } + + return 0; +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -88,9 +124,37 @@ static ssize_t mappings_show(struct device *dev, } static DEVICE_ATTR_RO(mappings); +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t init_namespaces_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region_namespaces *num_ns = dev_get_drvdata(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (num_ns) + rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count); + else + rc = -ENXIO; + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(init_namespaces); + static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, + &dev_attr_nstype.attr, &dev_attr_mappings.attr, + &dev_attr_init_namespaces.attr, NULL, }; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 39e7e606092a..37f966aff386 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -71,8 +71,11 @@ struct nd_region_desc { struct nvdimm_bus; struct device; -struct nvdimm_bus *nvdimm_bus_register(struct device *parent, - struct nvdimm_bus_descriptor *nfit_desc); +struct module; +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nfit_desc, struct module *module); +#define nvdimm_bus_register(parent, desc) \ + __nvdimm_bus_register(parent, desc, THIS_MODULE) void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); diff --git a/include/linux/nd.h b/include/linux/nd.h index e074f67e53a3..da70e9962197 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -26,6 +26,16 @@ static inline struct nd_device_driver *to_nd_device_driver( struct device_driver *drv) { return container_of(drv, struct nd_device_driver, drv); +}; + +struct nd_namespace_io { + struct device dev; + struct resource res; +}; + +static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev) +{ + return container_of(dev, struct nd_namespace_io, dev); } #define MODULE_ALIAS_ND_DEVICE(type) \ diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 37640916d146..174b6371dcc1 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -177,8 +177,18 @@ static inline const char *nvdimm_cmd_name(unsigned cmd) #define ND_DEVICE_DIMM 1 /* nd_dimm: container for "config data" */ +#define ND_DEVICE_REGION_PMEM 2 /* nd_region: (parent of PMEM namespaces) */ +#define ND_DEVICE_REGION_BLK 3 /* nd_region: (parent of BLK namespaces) */ +#define ND_DEVICE_NAMESPACE_IO 4 /* legacy persistent memory */ +#define ND_DEVICE_NAMESPACE_PMEM 5 /* PMEM namespace (may alias with BLK) */ +#define ND_DEVICE_NAMESPACE_BLK 6 /* BLK namespace (may alias with PMEM) */ enum nd_driver_flags { ND_DRIVER_DIMM = 1 << ND_DEVICE_DIMM, + ND_DRIVER_REGION_PMEM = 1 << ND_DEVICE_REGION_PMEM, + ND_DRIVER_REGION_BLK = 1 << ND_DEVICE_REGION_BLK, + ND_DRIVER_NAMESPACE_IO = 1 << ND_DEVICE_NAMESPACE_IO, + ND_DRIVER_NAMESPACE_PMEM = 1 << ND_DEVICE_NAMESPACE_PMEM, + ND_DRIVER_NAMESPACE_BLK = 1 << ND_DEVICE_NAMESPACE_BLK, }; #endif /* __NDCTL_H__ */ -- cgit v1.2.3 From eaf961536e1622ad21247ac8d44acd48ba65566e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 1 May 2015 13:11:27 -0400 Subject: libnvdimm, nfit: add interleave-set state-tracking infrastructure On platforms that have firmware support for reading/writing per-dimm label space, a portion of the dimm may be accessible via an interleave set PMEM mapping in addition to the dimm's BLK (block-data-window aperture(s)) interface. A label, stored in a "configuration data region" on the dimm, disambiguates which dimm addresses are accessed through which exclusive interface. Add infrastructure that allows the kernel to block modifications to a label in the set while any member dimm is active. Note that this is meant only for enforcing "no modifications of active labels" via the coarse ioctl command. Adding/deleting namespaces from an active interleave set is always possible via sysfs. Another aspect of tracking interleave sets is tracking their integrity when DIMMs in a set are physically re-ordered. For this purpose we generate an "interleave-set cookie" that can be recorded in a label and validated against the current configuration. It is the bus provider implementation's responsibility to calculate the interleave set cookie and attach it to a given region. Cc: Neil Brown Cc: Cc: Greg KH Cc: Robert Moore Cc: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 93 +++++++++++++++++++++++++++++++++++++++++++- drivers/nvdimm/bus.c | 59 +++++++++++++++++++++++++++- drivers/nvdimm/core.c | 17 ++++++++ drivers/nvdimm/dimm_devs.c | 19 ++++++++- drivers/nvdimm/nd-core.h | 10 ++++- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/region_devs.c | 69 ++++++++++++++++++++++++++++++++ include/linux/libnvdimm.h | 6 +++ 8 files changed, 269 insertions(+), 5 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index ce290748fe36..35af6f7f0abd 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "nfit.h" static bool force_enable_dimms; @@ -785,6 +786,91 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { NULL, }; +/* enough info to uniquely specify an interleave set */ +struct nfit_set_info { + struct nfit_set_info_map { + u64 region_offset; + u32 serial_number; + u32 pad; + } mapping[0]; +}; + +static size_t sizeof_nfit_set_info(int num_mappings) +{ + return sizeof(struct nfit_set_info) + + num_mappings * sizeof(struct nfit_set_info_map); +} + +static int cmp_map(const void *m0, const void *m1) +{ + const struct nfit_set_info_map *map0 = m0; + const struct nfit_set_info_map *map1 = m1; + + return memcmp(&map0->region_offset, &map1->region_offset, + sizeof(u64)); +} + +/* Retrieve the nth entry referencing this spa */ +static struct acpi_nfit_memory_map *memdev_from_spa( + struct acpi_nfit_desc *acpi_desc, u16 range_index, int n) +{ + struct nfit_memdev *nfit_memdev; + + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) + if (nfit_memdev->memdev->range_index == range_index) + if (n-- == 0) + return nfit_memdev->memdev; + return NULL; +} + +static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, + struct nd_region_desc *ndr_desc, + struct acpi_nfit_system_address *spa) +{ + int i, spa_type = nfit_spa_type(spa); + struct device *dev = acpi_desc->dev; + struct nd_interleave_set *nd_set; + u16 nr = ndr_desc->num_mappings; + struct nfit_set_info *info; + + if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE) + /* pass */; + else + return 0; + + nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) + return -ENOMEM; + + info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL); + if (!info) + return -ENOMEM; + for (i = 0; i < nr; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nfit_set_info_map *map = &info->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, + spa->range_index, i); + + if (!memdev || !nfit_mem->dcr) { + dev_err(dev, "%s: failed to find DCR\n", __func__); + return -ENODEV; + } + + map->region_offset = memdev->region_offset; + map->serial_number = nfit_mem->dcr->serial_number; + } + + sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map), + cmp_map, NULL); + nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + ndr_desc->nd_set = nd_set; + devm_kfree(dev, info); + + return 0; +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -838,7 +924,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct nd_region_desc ndr_desc; struct nvdimm_bus *nvdimm_bus; struct resource res; - int count = 0; + int count = 0, rc; if (spa->range_index == 0) { dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n", @@ -857,7 +943,6 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping *nd_mapping; - int rc; if (memdev->range_index != spa->range_index) continue; @@ -875,6 +960,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc.nd_mapping = nd_mappings; ndr_desc.num_mappings = count; + rc = acpi_nfit_init_interleave_set(acpi_desc, &ndr_desc, spa); + if (rc) + return rc; + nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 4b77665a6cc8..ffb43cada625 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -68,6 +68,21 @@ static struct module *to_bus_provider(struct device *dev) return NULL; } +static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + nvdimm_bus->probe_active++; + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) + wake_up(&nvdimm_bus->probe_wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + static int nvdimm_bus_probe(struct device *dev) { struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); @@ -78,7 +93,12 @@ static int nvdimm_bus_probe(struct device *dev) if (!try_module_get(provider)) return -ENXIO; + nvdimm_bus_probe_start(nvdimm_bus); rc = nd_drv->probe(dev); + if (rc == 0) + nd_region_probe_success(nvdimm_bus, dev); + nvdimm_bus_probe_end(nvdimm_bus); + dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, dev_name(dev), rc); if (rc != 0) @@ -94,6 +114,8 @@ static int nvdimm_bus_remove(struct device *dev) int rc; rc = nd_drv->remove(dev); + nd_region_disable(nvdimm_bus, dev); + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, dev_name(dev), rc); module_put(provider); @@ -359,6 +381,34 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, } EXPORT_SYMBOL_GPL(nd_cmd_out_size); +static void wait_nvdimm_bus_probe_idle(struct nvdimm_bus *nvdimm_bus) +{ + do { + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(&nvdimm_bus->dev); + wait_event(nvdimm_bus->probe_wait, + nvdimm_bus->probe_active == 0); + nvdimm_bus_lock(&nvdimm_bus->dev); + } while (true); +} + +/* set_config requires an idle interleave set */ +static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) + return 0; + + nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); + wait_nvdimm_bus_probe_idle(nvdimm_bus); + + if (atomic_read(&nvdimm->busy)) + return -EBUSY; + return 0; +} + static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, int read_only, unsigned int ioctl_cmd, unsigned long arg) { @@ -469,11 +519,18 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, goto out; } + nvdimm_bus_lock(&nvdimm_bus->dev); + rc = nd_cmd_clear_to_send(nvdimm, cmd); + if (rc) + goto out_unlock; + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len); if (rc < 0) - goto out; + goto out_unlock; if (copy_to_user(p, buf, buf_len)) rc = -EFAULT; + out_unlock: + nvdimm_bus_unlock(&nvdimm_bus->dev); out: vfree(buf); return rc; diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 1b6b15d11f54..7806eaaf4707 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -54,6 +54,22 @@ bool is_nvdimm_bus_locked(struct device *dev) } EXPORT_SYMBOL(is_nvdimm_bus_locked); +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} +EXPORT_SYMBOL_GPL(nd_fletcher64); + static void nvdimm_bus_release(struct device *dev) { struct nvdimm_bus *nvdimm_bus; @@ -175,6 +191,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, if (!nvdimm_bus) return NULL; INIT_LIST_HEAD(&nvdimm_bus->list); + init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); if (nvdimm_bus->id < 0) { diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index b3ae86f2e1da..bdf8241b6525 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -185,7 +185,24 @@ static ssize_t commands_show(struct device *dev, } static DEVICE_ATTR_RO(commands); +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * The state may be in the process of changing, userspace should + * quiesce probing if it wants a static answer + */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy) + ? "active" : "idle"); +} +static DEVICE_ATTR_RO(state); + static struct attribute *nvdimm_attributes[] = { + &dev_attr_state.attr, &dev_attr_commands.attr, NULL, }; @@ -213,7 +230,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, nvdimm->provider_data = provider_data; nvdimm->flags = flags; nvdimm->dsm_mask = dsm_mask; - + atomic_set(&nvdimm->busy, 0); dev = &nvdimm->dev; dev_set_name(dev, "nmem%d", nvdimm->id); dev->parent = &nvdimm_bus->dev; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 0e9b41fd2546..6a4b2c066ee7 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -14,6 +14,9 @@ #define __ND_CORE_H__ #include #include +#include +#include +#include extern struct list_head nvdimm_bus_list; extern struct mutex nvdimm_bus_list_mutex; @@ -21,10 +24,11 @@ extern int nvdimm_major; struct nvdimm_bus { struct nvdimm_bus_descriptor *nd_desc; + wait_queue_head_t probe_wait; struct module *module; struct list_head list; struct device dev; - int id; + int id, probe_active; struct mutex reconfig_mutex; }; @@ -33,6 +37,7 @@ struct nvdimm { void *provider_data; unsigned long *dsm_mask; struct device dev; + atomic_t busy; int id; }; @@ -42,10 +47,13 @@ bool is_nd_pmem(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); +void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev); +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); void nd_synchronize(void); int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus); int nd_match_dimm(struct device *dev, void *data); #endif /* __ND_CORE_H__ */ diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index bc5a08e36a25..0285e4588b03 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -35,6 +35,7 @@ struct nd_region { u64 ndr_start; int id; void *provider_data; + struct nd_interleave_set *nd_set; struct nd_mapping mapping[0]; }; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index b5c5b9095b28..1571424578f0 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -10,7 +10,10 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include +#include #include +#include #include #include "nd-core.h" #include "nd.h" @@ -133,6 +136,21 @@ static ssize_t nstype_show(struct device *dev, } static DEVICE_ATTR_RO(nstype); +static ssize_t set_cookie_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (is_nd_pmem(dev) && nd_set) + /* pass, should be precluded by region_visible */; + else + return -ENXIO; + + return sprintf(buf, "%#llx\n", nd_set->cookie); +} +static DEVICE_ATTR_RO(set_cookie); + static ssize_t init_namespaces_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -154,15 +172,65 @@ static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, &dev_attr_nstype.attr, &dev_attr_mappings.attr, + &dev_attr_set_cookie.attr, &dev_attr_init_namespaces.attr, NULL, }; +static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (a != &dev_attr_set_cookie.attr) + return a->mode; + + if (is_nd_pmem(dev) && nd_set) + return a->mode; + + return 0; +} + struct attribute_group nd_region_attribute_group = { .attrs = nd_region_attributes, + .is_visible = region_visible, }; EXPORT_SYMBOL_GPL(nd_region_attribute_group); +/* + * Upon successful probe/remove, take/release a reference on the + * associated interleave set (if present) + */ +static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, + struct device *dev, bool probe) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) { + struct nd_region *nd_region = to_nd_region(dev); + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (probe) + atomic_inc(&nvdimm->busy); + else + atomic_dec(&nvdimm->busy); + } + } +} + +void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, true); +} + +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, false); +} + static ssize_t mappingN(struct device *dev, char *buf, int n) { struct nd_region *nd_region = to_nd_region(dev); @@ -322,6 +390,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, } nd_region->ndr_mappings = ndr_desc->num_mappings; nd_region->provider_data = ndr_desc->provider_data; + nd_region->nd_set = ndr_desc->nd_set; dev = &nd_region->dev; dev_set_name(dev, "region%d", nd_region->id); dev->parent = &nvdimm_bus->dev; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 37f966aff386..1b627b109360 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -61,11 +61,16 @@ struct nd_cmd_desc { int out_sizes[ND_CMD_MAX_ELEM]; }; +struct nd_interleave_set { + u64 cookie; +}; + struct nd_region_desc { struct resource *res; struct nd_mapping *nd_mapping; u16 num_mappings; const struct attribute_group **attr_groups; + struct nd_interleave_set *nd_set; void *provider_data; }; @@ -101,4 +106,5 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Jun 2015 04:20:32 -0400 Subject: nd_btt: atomic sector updates BTT stands for Block Translation Table, and is a way to provide power fail sector atomicity semantics for block devices that have the ability to perform byte granularity IO. It relies on the capability of libnvdimm namespace devices to do byte aligned IO. The BTT works as a stacked blocked device, and reserves a chunk of space from the backing device for its accounting metadata. It is a bio-based driver because all IO is done synchronously, and there is no queuing or asynchronous completions at either the device or the driver level. The BTT uses 'lanes' to index into various 'on-disk' data structures, and lanes also act as a synchronization mechanism in case there are more CPUs than available lanes. We did a comparison between two lane lock strategies - first where we kept an atomic counter around that tracked which was the last lane that was used, and 'our' lane was determined by atomically incrementing that. That way, for the nr_cpus > nr_lanes case, theoretically, no CPU would be blocked waiting for a lane. The other strategy was to use the cpu number we're scheduled on to and hash it to a lane number. Theoretically, this could block an IO that could've otherwise run using a different, free lane. But some fio workloads showed that the direct cpu -> lane hash performed faster than tracking 'last lane' - my reasoning is the cache thrash caused by moving the atomic variable made that approach slower than simply waiting out the in-progress IO. This supports the conclusion that the driver can be a very simple bio-based one that does synchronous IOs instead of queuing. Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Neil Brown Cc: Jeff Moyer Cc: Dave Chinner Cc: Greg KH [jmoyer: fix nmi watchdog timeout in btt_map_init] [jmoyer: move btt initialization to module load path] [jmoyer: fix memory leak in the btt initialization path] [jmoyer: Don't overwrite corrupted arenas] Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- Documentation/nvdimm/btt.txt | 273 ++++++++ drivers/acpi/nfit.c | 1 + drivers/nvdimm/Kconfig | 28 +- drivers/nvdimm/Makefile | 3 + drivers/nvdimm/btt.c | 1371 +++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/btt.h | 141 ++++ drivers/nvdimm/btt_devs.c | 3 +- drivers/nvdimm/namespace_devs.c | 24 + drivers/nvdimm/nd.h | 22 +- drivers/nvdimm/pmem.c | 14 +- drivers/nvdimm/region.c | 12 + drivers/nvdimm/region_devs.c | 82 ++- include/linux/libnvdimm.h | 1 + 13 files changed, 1950 insertions(+), 25 deletions(-) create mode 100644 Documentation/nvdimm/btt.txt create mode 100644 drivers/nvdimm/btt.c (limited to 'drivers/acpi') diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt new file mode 100644 index 000000000000..95134d5ec4a0 --- /dev/null +++ b/Documentation/nvdimm/btt.txt @@ -0,0 +1,273 @@ +BTT - Block Translation Table +============================= + + +1. Introduction +--------------- + +Persistent memory based storage is able to perform IO at byte (or more +accurately, cache line) granularity. However, we often want to expose such +storage as traditional block devices. The block drivers for persistent memory +will do exactly this. However, they do not provide any atomicity guarantees. +Traditional SSDs typically provide protection against torn sectors in hardware, +using stored energy in capacitors to complete in-flight block writes, or perhaps +in firmware. We don't have this luxury with persistent memory - if a write is in +progress, and we experience a power failure, the block will contain a mix of old +and new data. Applications may not be prepared to handle such a scenario. + +The Block Translation Table (BTT) provides atomic sector update semantics for +persistent memory devices, so that applications that rely on sector writes not +being torn can continue to do so. The BTT manifests itself as a stacked block +device, and reserves a portion of the underlying storage for its metadata. At +the heart of it, is an indirection table that re-maps all the blocks on the +volume. It can be thought of as an extremely simple file system that only +provides atomic sector updates. + + +2. Static Layout +---------------- + +The underlying storage on which a BTT can be laid out is not limited in any way. +The BTT, however, splits the available space into chunks of up to 512 GiB, +called "Arenas". + +Each arena follows the same layout for its metadata, and all references in an +arena are internal to it (with the exception of one field that points to the +next arena). The following depicts the "On-disk" metadata layout: + + + Backing Store +-------> Arena ++---------------+ | +------------------+ +| | | | Arena info block | +| Arena 0 +---+ | 4K | +| 512G | +------------------+ +| | | | ++---------------+ | | +| | | | +| Arena 1 | | Data Blocks | +| 512G | | | +| | | | ++---------------+ | | +| . | | | +| . | | | +| . | | | +| | | | +| | | | ++---------------+ +------------------+ + | | + | BTT Map | + | | + | | + +------------------+ + | | + | BTT Flog | + | | + +------------------+ + | Info block copy | + | 4K | + +------------------+ + + +3. Theory of Operation +---------------------- + + +a. The BTT Map +-------------- + +The map is a simple lookup/indirection table that maps an LBA to an internal +block. Each map entry is 32 bits. The two most significant bits are special +flags, and the remaining form the internal block number. + +Bit Description +31 : TRIM flag - marks if the block was trimmed or discarded +30 : ERROR flag - marks an error block. Cleared on write. +29 - 0 : Mappings to internal 'postmap' blocks + + +Some of the terminology that will be subsequently used: + +External LBA : LBA as made visible to upper layers. +ABA : Arena Block Address - Block offset/number within an arena +Premap ABA : The block offset into an arena, which was decided upon by range + checking the External LBA +Postmap ABA : The block number in the "Data Blocks" area obtained after + indirection from the map +nfree : The number of free blocks that are maintained at any given time. + This is the number of concurrent writes that can happen to the + arena. + + +For example, after adding a BTT, we surface a disk of 1024G. We get a read for +the external LBA at 768G. This falls into the second arena, and of the 512G +worth of blocks that this arena contributes, this block is at 256G. Thus, the +premap ABA is 256G. We now refer to the map, and find out the mapping for block +'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64. + + +b. The BTT Flog +--------------- + +The BTT provides sector atomicity by making every write an "allocating write", +i.e. Every write goes to a "free" block. A running list of free blocks is +maintained in the form of the BTT flog. 'Flog' is a combination of the words +"free list" and "log". The flog contains 'nfree' entries, and an entry contains: + +lba : The premap ABA that is being written to +old_map : The old postmap ABA - after 'this' write completes, this will be a + free block. +new_map : The new postmap ABA. The map will up updated to reflect this + lba->postmap_aba mapping, but we log it here in case we have to + recover. +seq : Sequence number to mark which of the 2 sections of this flog entry is + valid/newest. It cycles between 01->10->11->01 (binary) under normal + operation, with 00 indicating an uninitialized state. +lba' : alternate lba entry +old_map': alternate old postmap entry +new_map': alternate new postmap entry +seq' : alternate sequence number. + +Each of the above fields is 32-bit, making one entry 16 bytes. Flog updates are +done such that for any entry being written, it: +a. overwrites the 'old' section in the entry based on sequence numbers +b. writes the new entry such that the sequence number is written last. + + +c. The concept of lanes +----------------------- + +While 'nfree' describes the number of concurrent IOs an arena can process +concurrently, 'nlanes' is the number of IOs the BTT device as a whole can +process. + nlanes = min(nfree, num_cpus) +A lane number is obtained at the start of any IO, and is used for indexing into +all the on-disk and in-memory data structures for the duration of the IO. It is +protected by a spinlock. + + +d. In-memory data structure: Read Tracking Table (RTT) +------------------------------------------------------ + +Consider a case where we have two threads, one doing reads and the other, +writes. We can hit a condition where the writer thread grabs a free block to do +a new IO, but the (slow) reader thread is still reading from it. In other words, +the reader consulted a map entry, and started reading the corresponding block. A +writer started writing to the same external LBA, and finished the write updating +the map for that external LBA to point to its new postmap ABA. At this point the +internal, postmap block that the reader is (still) reading has been inserted +into the list of free blocks. If another write comes in for the same LBA, it can +grab this free block, and start writing to it, causing the reader to read +incorrect data. To prevent this, we introduce the RTT. + +The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts +into rtt[lane_number], the postmap ABA it is reading, and clears it after the +read is complete. Every writer thread, after grabbing a free block, checks the +RTT for its presence. If the postmap free block is in the RTT, it waits till the +reader clears the RTT entry, and only then starts writing to it. + + +e. In-memory data structure: map locks +-------------------------------------- + +Consider a case where two writer threads are writing to the same LBA. There can +be a race in the following sequence of steps: + +free[lane] = map[premap_aba] +map[premap_aba] = postmap_aba + +Both threads can update their respective free[lane] with the same old, freed +postmap_aba. This has made the layout inconsistent by losing a free entry, and +at the same time, duplicating another free entry for two lanes. + +To solve this, we could have a single map lock (per arena) that has to be taken +before performing the above sequence, but we feel that could be too contentious. +Instead we use an array of (nfree) map_locks that is indexed by +(premap_aba modulo nfree). + + +f. Reconstruction from the Flog +------------------------------- + +On startup, we analyze the BTT flog to create our list of free blocks. We walk +through all the entries, and for each lane, of the set of two possible +'sections', we always look at the most recent one only (based on the sequence +number). The reconstruction rules/steps are simple: +- Read map[log_entry.lba]. +- If log_entry.new matches the map entry, then log_entry.old is free. +- If log_entry.new does not match the map entry, then log_entry.new is free. + (This case can only be caused by power-fails/unsafe shutdowns) + + +g. Summarizing - Read and Write flows +------------------------------------- + +Read: + +1. Convert external LBA to arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Read map to get the entry for this pre-map ABA +4. Enter post-map ABA into RTT[lane] +5. If TRIM flag set in map, return zeroes, and end IO (go to step 8) +6. If ERROR flag set in map, end IO with EIO (go to step 8) +7. Read data from this block +8. Remove post-map ABA entry from RTT[lane] +9. Release lane (and lane_lock) + +Write: + +1. Convert external LBA to Arena number + pre-map ABA +2. Get a lane (and take lane_lock) +3. Use lane to index into in-memory free list and obtain a new block, next flog + index, next sequence number +4. Scan the RTT to check if free block is present, and spin/wait if it is. +5. Write data to this free block +6. Read map to get the existing post-map ABA entry for this pre-map ABA +7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num] +8. Write new post-map ABA into map. +9. Write old post-map entry into the free list +10. Calculate next sequence number and write into the free list entry +11. Release lane (and lane_lock) + + +4. Error Handling +================= + +An arena would be in an error state if any of the metadata is corrupted +irrecoverably, either due to a bug or a media error. The following conditions +indicate an error: +- Info block checksum does not match (and recovering from the copy also fails) +- All internal available blocks are not uniquely and entirely addressed by the + sum of mapped blocks and free blocks (from the BTT flog). +- Rebuilding free list from the flog reveals missing/duplicate/impossible + entries +- A map entry is out of bounds + +If any of these error conditions are encountered, the arena is put into a read +only state using a flag in the info block. + + +5. In-kernel usage +================== + +Any block driver that supports byte granularity IO to the storage may register +with the BTT. It will have to provide the rw_bytes interface in its +block_device_operations struct: + + int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw); + +It may register with the BTT after it adds its own gendisk, using btt_init: + + struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize, + u32 lbasize, u8 uuid[], int maxlane); + +note that maxlane is the maximum amount of concurrency the driver wishes to +allow the BTT to use. + +The BTT 'disk' appears as a stacked block device that grabs the underlying block +device in the O_EXCL mode. + +When the driver wishes to remove the backing disk, it should similarly call +btt_fini using the same struct btt* handle that was provided to it by btt_init. + + void btt_fini(struct btt *btt); + diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 35af6f7f0abd..fc38b49eff7d 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -902,6 +902,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, } else { nd_mapping->size = nfit_mem->bdw->capacity; nd_mapping->start = nfit_mem->bdw->start_address; + ndr_desc->num_lanes = nfit_mem->bdw->windows; blk_valid = 1; } diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 5680e8e7a7aa..204ee0796411 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -8,11 +8,11 @@ menuconfig LIBNVDIMM NFIT, or otherwise can discover NVDIMM resources, a libnvdimm bus is registered to advertise PMEM (persistent memory) namespaces (/dev/pmemX) and BLK (sliding mmio window(s)) - namespaces (/dev/ndX). A PMEM namespace refers to a memory - resource that may span multiple DIMMs and support DAX (see - CONFIG_DAX). A BLK namespace refers to an NVDIMM control - region which exposes an mmio register set for windowed - access mode to non-volatile memory. + namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a + memory resource that may span multiple DIMMs and support DAX + (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control + region which exposes an mmio register set for windowed access + mode to non-volatile memory. if LIBNVDIMM @@ -20,6 +20,7 @@ config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM depends on HAS_IOMEM + select ND_BTT if BTT help Memory ranges for PMEM are described by either an NFIT (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a @@ -33,7 +34,22 @@ config BLK_DEV_PMEM Say Y if you want to use an NVDIMM +config ND_BTT + tristate + config BTT - def_bool y + bool "BTT: Block Translation Table (atomic sector updates)" + default y if LIBNVDIMM + help + The Block Translation Table (BTT) provides atomic sector + update semantics for persistent memory devices, so that + applications that rely on sector writes not being torn (a + guarantee that typical disks provide) can continue to do so. + The BTT manifests itself as an alternate personality for an + NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX, + ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys, + etc...). + + Select Y if unsure endif diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 6085b4bd7312..d2aab6c58492 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -1,8 +1,11 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o nd_pmem-y := pmem.o +nd_btt-y := btt.o + libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c new file mode 100644 index 000000000000..7ae38aac2c25 --- /dev/null +++ b/drivers/nvdimm/btt.c @@ -0,0 +1,1371 @@ +/* + * Block Translation Table + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static int btt_major; + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_read_bytes(ndns, offset, buf, n); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_write_bytes(ndns, offset, buf, n); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb)); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + WARN_ON(!super); + return arena_read_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. + */ + mapping &= MAP_LBA_MASK; + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + WARN_ONCE(1, "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT; + e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT; + ze = (z_flag << 1) + e_flag; + postmap = raw_mapping & MAP_LBA_MASK; + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_read_pair(struct arena_info *arena, u32 lane, + struct log_entry *ent) +{ + WARN_ON(!ent); + return arena_read_bytes(arena, + arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, + 2 * LOG_ENT_SIZE); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, &a->flags); +} + +static void btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * It also updates the sequence number in this old entry to + * make it the 'new' one if the mark_flag is set. + * Finally, it returns which of the entries was the older one. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct log_entry *ent) +{ + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (ent[0].seq == 0) { + ent[0].seq = cpu_to_le32(1); + return 0; + } + + if (ent[0].seq == ent[1].seq) + return -EINVAL; + if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) + return -EINVAL; + + if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { + if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) + old = 0; + else + old = 1; + } else { + if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. + */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_entry log[2]; + + ret = btt_log_read_pair(arena, lane, log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(log); + if (old_ent < 0 || old_ent > 1) { + dev_info(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log[0].seq, log[1].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent) +{ + int ret; + /* + * Ignore the padding in log_entry for calculating log_half. + * The entry is 'committed' when we write the sequence number, + * and we want to ensure that that is the last thing written. + * We don't bother writing the padding as that would be extra + * media wear and write amplification + */ + unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; + u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); + void *src = ent; + + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + arena->freelist[lane].block = le32_to_cpu(ent->old_map); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + int ret; + u32 i; + struct log_entry log, zerolog; + + memset(&zerolog, 0, sizeof(zerolog)); + + for (i = 0; i < arena->nfree; i++) { + log.lba = cpu_to_le32(i); + log.old_map = cpu_to_le32(arena->external_nlba + i); + log.new_map = cpu_to_le32(arena->external_nlba + i); + log.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &log); + if (ret) + return ret; + ret = __btt_log_write(arena, i, 1, &zerolog); + if (ret) + return ret; + } + + return 0; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int old, new, ret; + u32 i, map_entry; + struct log_entry log_new, log_old; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT); + if (old < 0) + return old; + + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = le32_to_cpu(log_new.old_map); + + /* This implies a newly created or untouched flog entry */ + if (log_new.old_map == log_new.new_map) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL); + if (ret) + return ret; + if ((le32_to_cpu(log_new.new_map) != map_entry) && + (le32_to_cpu(log_new.old_map) == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. + */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0); + if (ret) + return ret; + } + + } + + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = 1; + arena->version_minor = 1; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), + BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function checks if the metadata layout is valid and error free + */ +static int arena_is_valid(struct arena_info *arena, struct btt_sb *super, + u8 *uuid, u32 lbasize) +{ + u64 checksum; + + if (memcmp(super->uuid, uuid, 16)) + return 0; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_btt_sb_checksum(super)) + return 0; + super->checksum = cpu_to_le64(checksum); + + if (lbasize != le32_to_cpu(super->external_lbasize)) + return 0; + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(to_dev(arena), "Found arena with an error flag\n"); + + return 1; +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) : + (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!arena_is_valid(arena, super, btt->nd_btt->uuid, + btt->lbasize)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_info(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { + free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. + */ +static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid) +{ + int ret; + struct btt_sb *super; + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + memcpy(super->uuid, uuid, 16); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + super->checksum = cpu_to_le64(nd_btt_sb_checksum(super)); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena, btt->nd_btt->uuid); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. + */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +static int btt_read_pg(struct btt *btt, struct page *page, unsigned int off, + sector_t sector, unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. + */ + while (1) { + u32 new_map; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &t_flag, + &e_flag); + if (ret) + goto out_rtt; + + if (postmap == new_map) + break; + + postmap = new_map; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) + goto out_rtt; + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_write_pg(struct btt *btt, sector_t sector, struct page *page, + unsigned int off, unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } else + ret = btt_data_write(arena, new_postmap, page, + off, cur_len); + if (ret) + goto out_lane; + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + int ret; + + if (rw == READ) { + ret = btt_read_pg(btt, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, sector, page, off, len); + } + + return ret; +} + +static void btt_make_request(struct request_queue *q, struct bio *bio) +{ + struct btt *btt = q->queuedata; + struct bvec_iter iter; + struct bio_vec bvec; + int err = 0, rw; + + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + /* Make sure len is in multiples of sector size. */ + /* XXX is this right? */ + BUG_ON(len < btt->sector_size); + BUG_ON(len % btt->sector_size); + + err = btt_do_bvec(btt, bvec.bv_page, len, bvec.bv_offset, + rw, iter.bi_sector); + if (err) { + dev_info(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? "READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + goto out; + } + } + +out: + bio_endio(bio, err); +} + +static int btt_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct btt *btt = bdev->bd_disk->private_data; + + btt_do_bvec(btt, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, 0); + return 0; +} + + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .rw_page = btt_rw_page, + .getgeo = btt_getgeo, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* create a new disk and request queue for btt */ + btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + if (!btt->btt_queue) + return -ENOMEM; + + btt->btt_disk = alloc_disk(0); + if (!btt->btt_disk) { + blk_cleanup_queue(btt->btt_queue); + return -ENOMEM; + } + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->driverfs_dev = &btt->nd_btt->dev; + btt->btt_disk->major = btt_major; + btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + btt->btt_disk->queue = btt->btt_queue; + btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + + blk_queue_make_request(btt->btt_queue, btt_make_request); + blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); + blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); + btt->btt_queue->queuedata = btt; + + set_capacity(btt->btt_disk, + btt->nlba * btt->sector_size >> SECTOR_SHIFT); + add_disk(btt->btt_disk); + + return 0; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. + */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, u8 *uuid, struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct device *dev = &nd_btt->dev; + + btt = kzalloc(sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + goto out_free; + } + + if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + goto out_free; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + return NULL; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + goto out_free; + } + + btt_debugfs_init(btt); + + return btt; + + out_free: + kfree(btt); + return NULL; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. + */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + kfree(btt); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt *btt; + size_t rawsize; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) + return -ENODEV; + + rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K; + if (rawsize < ARENA_MIN_SIZE) { + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + + btt_major = register_blkdev(0, "btt"); + if (btt_major < 0) + return btt_major; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) { + rc = -ENXIO; + goto err_debugfs; + } + + return 0; + + err_debugfs: + unregister_blkdev(btt_major, "btt"); + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); + unregister_blkdev(btt_major, "btt"); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma "); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index e8f6d8e0ddd3..8c95a7792c3e 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -19,6 +19,39 @@ #define BTT_SIG_LEN 16 #define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define INT_LBASIZE_ALIGNMENT 256 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; + __le64 padding[2]; +}; struct btt_sb { u8 signature[BTT_SIG_LEN]; @@ -42,4 +75,112 @@ struct btt_sb { __le64 checksum; }; +struct free_entry { + u32 block; + u8 sub; + u8 seq; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 cacheline_padding[L1_CACHE_BYTES]; + }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. + * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. + */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; +}; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @btt_queue: Pointer to the request queue for the BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. + * @sector_size: The Linux sector size - 512 or 4096 + * @lanes: Per-lane spinlocks + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + */ +struct btt { + struct gendisk *btt_disk; + struct request_queue *btt_queue; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; +}; #endif diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index effb70a88347..470fbdccd0ac 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -348,7 +348,8 @@ struct device *nd_btt_create(struct nd_region *nd_region) */ u64 nd_btt_sb_checksum(struct btt_sb *btt_sb) { - u64 sum, sum_save; + u64 sum; + __le64 sum_save; sum_save = btt_sb->checksum; btt_sb->checksum = 0; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 2c50a0719f8d..4aa647c8d644 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -76,6 +76,30 @@ static bool is_namespace_io(struct device *dev) return dev ? dev->type == &namespace_io_device_type : false; } +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = ""; + + if (ndns->claim && is_nd_btt(ndns->claim)) + suffix = "s"; + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) + sprintf(name, "pmem%d%s", nd_region->id, suffix); + else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + static ssize_t nstype_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index d13eccbb67e9..1b937c235913 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -20,6 +20,12 @@ #include "label.h" enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, SECTOR_SHIFT = 9, }; @@ -75,6 +81,11 @@ static inline struct nd_namespace_index *to_next_namespace_index( for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \ res; res = next, next = next ? next->sibling : NULL) +struct nd_percpu_lane { + int count; + spinlock_t lock; +}; + struct nd_region { struct device dev; struct ida ns_ida; @@ -84,9 +95,10 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id; + int id, num_lanes; void *provider_data; struct nd_interleave_set *nd_set; + struct nd_percpu_lane __percpu *lane; struct nd_mapping mapping[0]; }; @@ -100,9 +112,11 @@ static inline unsigned nd_inc_seq(unsigned seq) return next[seq & 3]; } +struct btt; struct nd_btt { struct device dev; struct nd_namespace_common *ndns; + struct btt *btt; unsigned long lbasize; u8 *uuid; int id; @@ -157,6 +171,8 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) #endif struct nd_region *to_nd_region(struct device *dev); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); @@ -172,4 +188,8 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, resource_size_t n); resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns); struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev); +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d0c6b4bdba69..7346054bccbb 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -160,7 +160,6 @@ static void pmem_detach_disk(struct pmem_device *pmem) static int pmem_attach_disk(struct nd_namespace_common *ndns, struct pmem_device *pmem) { - struct nd_region *nd_region = to_nd_region(ndns->dev.parent); struct gendisk *disk; pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); @@ -183,7 +182,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns, disk->private_data = pmem; disk->queue = pmem->pmem_queue; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", nd_region->id); + nvdimm_namespace_disk_name(ndns, disk->disk_name); disk->driverfs_dev = &ndns->dev; set_capacity(disk, pmem->size >> 9); pmem->pmem_disk = disk; @@ -211,17 +210,6 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns, return 0; } -static int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) -{ - /* TODO */ - return -ENXIO; -} - -static void nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) -{ - /* TODO */ -} - static void pmem_free(struct pmem_device *pmem) { iounmap(pmem->virt_addr); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 2a5f3f53d79d..eb8aebcd4800 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -10,6 +10,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include #include #include #include @@ -18,10 +19,21 @@ static int nd_region_probe(struct device *dev) { int err; + static unsigned long once; struct nd_region_namespaces *num_ns; struct nd_region *nd_region = to_nd_region(dev); int rc = nd_region_register_namespaces(nd_region, &err); + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); if (!num_ns) return -ENOMEM; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4fd92389fa7e..fe8ec21fe3d7 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -32,6 +32,7 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } + free_percpu(nd_region->lane); ida_simple_remove(®ion_ida, nd_region->id); kfree(nd_region); } @@ -531,13 +532,66 @@ void *nd_region_provider_data(struct nd_region *nd_region) } EXPORT_SYMBOL_GPL(nd_region_provider_data); +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. + */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + cpu = get_cpu(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = get_cpu(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + put_cpu(); + } + put_cpu(); +} +EXPORT_SYMBOL(nd_region_release_lane); + static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc, struct device_type *dev_type, const char *caller) { struct nd_region *nd_region; struct device *dev; - u16 i; + unsigned int i; for (i = 0; i < ndr_desc->num_mappings; i++) { struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; @@ -557,9 +611,19 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!nd_region) return NULL; nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); - if (nd_region->id < 0) { - kfree(nd_region); - return NULL; + if (nd_region->id < 0) + goto err_id; + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; } memcpy(nd_region->mapping, ndr_desc->nd_mapping, @@ -573,6 +637,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->ndr_mappings = ndr_desc->num_mappings; nd_region->provider_data = ndr_desc->provider_data; nd_region->nd_set = ndr_desc->nd_set; + nd_region->num_lanes = ndr_desc->num_lanes; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); dev = &nd_region->dev; @@ -585,11 +650,18 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_device_register(dev); return nd_region; + + err_percpu: + ida_simple_remove(®ion_ida, nd_region->id); + err_id: + kfree(nd_region); + return NULL; } struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc) { + ndr_desc->num_lanes = ND_MAX_LANES; return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, __func__); } @@ -600,6 +672,7 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, { if (ndr_desc->num_mappings > 1) return NULL; + ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES); return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, __func__); } @@ -608,6 +681,7 @@ EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc) { + ndr_desc->num_lanes = ND_MAX_LANES; return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, __func__); } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index a59dca17b3aa..531d99dfac68 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -85,6 +85,7 @@ struct nd_region_desc { const struct attribute_group **attr_groups; struct nd_interleave_set *nd_set; void *provider_data; + int num_lanes; }; struct nvdimm_bus; -- cgit v1.2.3 From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 25 Jun 2015 04:21:02 -0400 Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory The libnvdimm implementation handles allocating dimm address space (DPA) between PMEM and BLK mode interfaces. After DPA has been allocated from a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O as a struct bio based block device. Unlike PMEM, BLK is required to handle platform specific details like mmio register formats and memory controller interleave. For this reason the libnvdimm generic nd_blk driver calls back into the bus provider to carry out the I/O. This initial implementation handles the BLK interface defined by the ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from DCR (dimm control region), BDW (block data window), IDT (interleave descriptor) NFIT structures and the hardware register format. [1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf [2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf Cc: Andy Lutomirski Cc: Boaz Harrosh Cc: H. Peter Anvin Cc: Jens Axboe Cc: Ingo Molnar Cc: Christoph Hellwig Signed-off-by: Ross Zwisler Acked-by: Rafael J. Wysocki Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 449 ++++++++++++++++++++++++++++++++++++++-- drivers/acpi/nfit.h | 49 +++++ drivers/nvdimm/Kconfig | 13 ++ drivers/nvdimm/Makefile | 3 + drivers/nvdimm/blk.c | 245 ++++++++++++++++++++++ drivers/nvdimm/dimm_devs.c | 9 + drivers/nvdimm/namespace_devs.c | 65 +++++- drivers/nvdimm/nd-core.h | 3 +- drivers/nvdimm/nd.h | 13 +- drivers/nvdimm/region.c | 8 +- drivers/nvdimm/region_devs.c | 91 +++++++- include/linux/libnvdimm.h | 27 ++- 12 files changed, 941 insertions(+), 34 deletions(-) create mode 100644 drivers/nvdimm/blk.c (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index fc38b49eff7d..2aa6aa97b40c 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -13,12 +13,20 @@ #include #include #include +#include #include #include #include #include +#include #include "nfit.h" +/* + * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is + * irrelevant. + */ +#include + static bool force_enable_dimms; module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); @@ -72,7 +80,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, if (!adev) return -ENOTTY; - dimm_name = dev_name(&adev->dev); + dimm_name = nvdimm_name(nvdimm); cmd_name = nvdimm_cmd_name(cmd); dsm_mask = nfit_mem->dsm_mask; desc = nd_cmd_dimm_desc(cmd); @@ -279,6 +287,23 @@ static bool add_bdw(struct acpi_nfit_desc *acpi_desc, return true; } +static bool add_idt(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_interleave *idt) +{ + struct device *dev = acpi_desc->dev; + struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt), + GFP_KERNEL); + + if (!nfit_idt) + return false; + INIT_LIST_HEAD(&nfit_idt->list); + nfit_idt->idt = idt; + list_add_tail(&nfit_idt->list, &acpi_desc->idts); + dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__, + idt->interleave_index, idt->line_count); + return true; +} + static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, const void *end) { @@ -307,9 +332,9 @@ static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table, if (!add_bdw(acpi_desc, table)) return err; break; - /* TODO */ case ACPI_NFIT_TYPE_INTERLEAVE: - dev_dbg(dev, "%s: idt\n", __func__); + if (!add_idt(acpi_desc, table)) + return err; break; case ACPI_NFIT_TYPE_FLUSH_ADDRESS: dev_dbg(dev, "%s: flush\n", __func__); @@ -362,8 +387,11 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa) { u16 dcr = __to_nfit_memdev(nfit_mem)->region_index; + struct nfit_memdev *nfit_memdev; struct nfit_dcr *nfit_dcr; struct nfit_bdw *nfit_bdw; + struct nfit_idt *nfit_idt; + u16 idt_idx, range_index; list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) { if (nfit_dcr->dcr->region_index != dcr) @@ -396,6 +424,26 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc, return 0; nfit_mem_find_spa_bdw(acpi_desc, nfit_mem); + + if (!nfit_mem->spa_bdw) + return 0; + + range_index = nfit_mem->spa_bdw->range_index; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { + if (nfit_memdev->memdev->range_index != range_index || + nfit_memdev->memdev->region_index != dcr) + continue; + nfit_mem->memdev_bdw = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_bdw = nfit_idt->idt; + break; + } + break; + } + return 0; } @@ -439,9 +487,19 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc, } if (type == NFIT_SPA_DCR) { + struct nfit_idt *nfit_idt; + u16 idt_idx; + /* multiple dimms may share a SPA when interleaved */ nfit_mem->spa_dcr = spa; nfit_mem->memdev_dcr = nfit_memdev->memdev; + idt_idx = nfit_memdev->memdev->interleave_index; + list_for_each_entry(nfit_idt, &acpi_desc->idts, list) { + if (nfit_idt->idt->interleave_index != idt_idx) + continue; + nfit_mem->idt_dcr = nfit_idt->idt; + break; + } } else { /* * A single dimm may belong to multiple SPA-PM @@ -871,6 +929,359 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, return 0; } +static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio) +{ + struct acpi_nfit_interleave *idt = mmio->idt; + u32 sub_line_offset, line_index, line_offset; + u64 line_no, table_skip_count, table_offset; + + line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset); + table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index); + line_offset = idt->line_offset[line_index] + * mmio->line_size; + table_offset = table_skip_count * mmio->table_size; + + return mmio->base_offset + line_offset + table_offset + sub_line_offset; +} + +static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + u64 offset = nfit_blk->stat_offset + mmio->size * bw; + + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + return readq(mmio->base + offset); +} + +static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, + resource_size_t dpa, unsigned int len, unsigned int write) +{ + u64 cmd, offset; + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR]; + + enum { + BCW_OFFSET_MASK = (1ULL << 48)-1, + BCW_LEN_SHIFT = 48, + BCW_LEN_MASK = (1ULL << 8) - 1, + BCW_CMD_SHIFT = 56, + }; + + cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK; + len = len >> L1_CACHE_SHIFT; + cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT; + cmd |= ((u64) write) << BCW_CMD_SHIFT; + + offset = nfit_blk->cmd_offset + mmio->size * bw; + if (mmio->num_lines) + offset = to_interleave_offset(offset, mmio); + + writeq(cmd, mmio->base + offset); + /* FIXME: conditionally perform read-back if mandated by firmware */ +} + +static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, + resource_size_t dpa, void *iobuf, size_t len, int rw, + unsigned int lane) +{ + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + unsigned int copied = 0; + u64 base_offset; + int rc; + + base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES + + lane * mmio->size; + /* TODO: non-temporal access, flush hints, cache management etc... */ + write_blk_ctl(nfit_blk, lane, dpa, len, rw); + while (len) { + unsigned int c; + u64 offset; + + if (mmio->num_lines) { + u32 line_offset; + + offset = to_interleave_offset(base_offset + copied, + mmio); + div_u64_rem(offset, mmio->line_size, &line_offset); + c = min_t(size_t, len, mmio->line_size - line_offset); + } else { + offset = base_offset + nfit_blk->bdw_offset; + c = len; + } + + if (rw) + memcpy(mmio->aperture + offset, iobuf + copied, c); + else + memcpy(iobuf + copied, mmio->aperture + offset, c); + + copied += c; + len -= c; + } + rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0; + return rc; +} + +static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr, + resource_size_t dpa, void *iobuf, u64 len, int rw) +{ + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + struct nd_region *nd_region = nfit_blk->nd_region; + unsigned int lane, copied = 0; + int rc = 0; + + lane = nd_region_acquire_lane(nd_region); + while (len) { + u64 c = min(len, mmio->size); + + rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied, + iobuf + copied, c, rw, lane); + if (rc) + break; + + copied += c; + len -= c; + } + nd_region_release_lane(nd_region, lane); + + return rc; +} + +static void nfit_spa_mapping_release(struct kref *kref) +{ + struct nfit_spa_mapping *spa_map = to_spa_map(kref); + struct acpi_nfit_system_address *spa = spa_map->spa; + struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index); + iounmap(spa_map->iomem); + release_mem_region(spa->address, spa->length); + list_del(&spa_map->list); + kfree(spa_map); +} + +static struct nfit_spa_mapping *find_spa_mapping( + struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + list_for_each_entry(spa_map, &acpi_desc->spa_maps, list) + if (spa_map->spa == spa) + return spa_map; + + return NULL; +} + +static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + struct nfit_spa_mapping *spa_map; + + mutex_lock(&acpi_desc->spa_map_mutex); + spa_map = find_spa_mapping(acpi_desc, spa); + + if (spa_map) + kref_put(&spa_map->kref, nfit_spa_mapping_release); + mutex_unlock(&acpi_desc->spa_map_mutex); +} + +static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + resource_size_t start = spa->address; + resource_size_t n = spa->length; + struct nfit_spa_mapping *spa_map; + struct resource *res; + + WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); + + spa_map = find_spa_mapping(acpi_desc, spa); + if (spa_map) { + kref_get(&spa_map->kref); + return spa_map->iomem; + } + + spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL); + if (!spa_map) + return NULL; + + INIT_LIST_HEAD(&spa_map->list); + spa_map->spa = spa; + kref_init(&spa_map->kref); + spa_map->acpi_desc = acpi_desc; + + res = request_mem_region(start, n, dev_name(acpi_desc->dev)); + if (!res) + goto err_mem; + + /* TODO: cacheability based on the spa type */ + spa_map->iomem = ioremap_nocache(start, n); + if (!spa_map->iomem) + goto err_map; + + list_add_tail(&spa_map->list, &acpi_desc->spa_maps); + return spa_map->iomem; + + err_map: + release_mem_region(start, n); + err_mem: + kfree(spa_map); + return NULL; +} + +/** + * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges + * @nvdimm_bus: NFIT-bus that provided the spa table entry + * @nfit_spa: spa table to map + * + * In the case where block-data-window apertures and + * dimm-control-regions are interleaved they will end up sharing a + * single request_mem_region() + ioremap() for the address range. In + * the style of devm nfit_spa_map() mappings are automatically dropped + * when all region devices referencing the same mapping are disabled / + * unbound. + */ +static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc, + struct acpi_nfit_system_address *spa) +{ + void __iomem *iomem; + + mutex_lock(&acpi_desc->spa_map_mutex); + iomem = __nfit_spa_map(acpi_desc, spa); + mutex_unlock(&acpi_desc->spa_map_mutex); + + return iomem; +} + +static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio, + struct acpi_nfit_interleave *idt, u16 interleave_ways) +{ + if (idt) { + mmio->num_lines = idt->line_count; + mmio->line_size = idt->line_size; + if (interleave_ways == 0) + return -ENXIO; + mmio->table_size = mmio->num_lines * interleave_ways + * mmio->line_size; + } + + return 0; +} + +static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk_mmio *mmio; + struct nfit_blk *nfit_blk; + struct nfit_mem *nfit_mem; + struct nvdimm *nvdimm; + int rc; + + nvdimm = nd_blk_region_to_dimm(ndbr); + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) { + dev_dbg(dev, "%s: missing%s%s%s\n", __func__, + nfit_mem ? "" : " nfit_mem", + nfit_mem->dcr ? "" : " dcr", + nfit_mem->bdw ? "" : " bdw"); + return -ENXIO; + } + + nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL); + if (!nfit_blk) + return -ENOMEM; + nd_blk_region_set_provider_data(ndbr, nfit_blk); + nfit_blk->nd_region = to_nd_region(dev); + + /* map block aperture memory */ + nfit_blk->bdw_offset = nfit_mem->bdw->offset; + mmio = &nfit_blk->mmio[BDW]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->bdw->size; + mmio->base_offset = nfit_mem->memdev_bdw->region_offset; + mmio->idt = nfit_mem->idt_bdw; + mmio->spa = nfit_mem->spa_bdw; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw, + nfit_mem->memdev_bdw->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init bdw interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + /* map block control memory */ + nfit_blk->cmd_offset = nfit_mem->dcr->command_offset; + nfit_blk->stat_offset = nfit_mem->dcr->status_offset; + mmio = &nfit_blk->mmio[DCR]; + mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr); + if (!mmio->base) { + dev_dbg(dev, "%s: %s failed to map dcr\n", __func__, + nvdimm_name(nvdimm)); + return -ENOMEM; + } + mmio->size = nfit_mem->dcr->window_size; + mmio->base_offset = nfit_mem->memdev_dcr->region_offset; + mmio->idt = nfit_mem->idt_dcr; + mmio->spa = nfit_mem->spa_dcr; + rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr, + nfit_mem->memdev_dcr->interleave_ways); + if (rc) { + dev_dbg(dev, "%s: %s failed to init dcr interleave\n", + __func__, nvdimm_name(nvdimm)); + return rc; + } + + if (mmio->line_size == 0) + return 0; + + if ((u32) nfit_blk->cmd_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "cmd_offset crosses interleave boundary\n"); + return -ENXIO; + } else if ((u32) nfit_blk->stat_offset % mmio->line_size + + 8 > mmio->line_size) { + dev_dbg(dev, "stat_offset crosses interleave boundary\n"); + return -ENXIO; + } + + return 0; +} + +static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, + struct device *dev) +{ + struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus); + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nd_blk_region *ndbr = to_nd_blk_region(dev); + struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr); + int i; + + if (!nfit_blk) + return; /* never enabled */ + + /* auto-free BLK spa mappings */ + for (i = 0; i < 2; i++) { + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i]; + + if (mmio->base) + nfit_spa_unmap(acpi_desc, mmio->spa); + } + nd_blk_region_set_provider_data(ndbr, NULL); + /* devm will free nfit_blk */ +} + static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc, struct acpi_nfit_memory_map *memdev, @@ -878,6 +1289,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, { struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, memdev->device_handle); + struct nd_blk_region_desc *ndbr_desc; struct nfit_mem *nfit_mem; int blk_valid = 0; @@ -908,6 +1320,10 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, ndr_desc->nd_mapping = nd_mapping; ndr_desc->num_mappings = blk_valid; + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr_desc->enable = acpi_nfit_blk_region_enable; + ndbr_desc->disable = acpi_nfit_blk_region_disable; + ndbr_desc->do_io = acpi_nfit_blk_region_do_io; if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc)) return -ENOMEM; break; @@ -921,8 +1337,9 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, { static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS]; struct acpi_nfit_system_address *spa = nfit_spa->spa; + struct nd_blk_region_desc ndbr_desc; + struct nd_region_desc *ndr_desc; struct nfit_memdev *nfit_memdev; - struct nd_region_desc ndr_desc; struct nvdimm_bus *nvdimm_bus; struct resource res; int count = 0, rc; @@ -935,12 +1352,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, memset(&res, 0, sizeof(res)); memset(&nd_mappings, 0, sizeof(nd_mappings)); - memset(&ndr_desc, 0, sizeof(ndr_desc)); + memset(&ndbr_desc, 0, sizeof(ndbr_desc)); res.start = spa->address; res.end = res.start + spa->length - 1; - ndr_desc.res = &res; - ndr_desc.provider_data = nfit_spa; - ndr_desc.attr_groups = acpi_nfit_region_attribute_groups; + ndr_desc = &ndbr_desc.ndr_desc; + ndr_desc->res = &res; + ndr_desc->provider_data = nfit_spa; + ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping *nd_mapping; @@ -953,24 +1371,24 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, return -ENXIO; } nd_mapping = &nd_mappings[count++]; - rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, &ndr_desc, + rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc, memdev, spa); if (rc) return rc; } - ndr_desc.nd_mapping = nd_mappings; - ndr_desc.num_mappings = count; - rc = acpi_nfit_init_interleave_set(acpi_desc, &ndr_desc, spa); + ndr_desc->nd_mapping = nd_mappings; + ndr_desc->num_mappings = count; + rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa); if (rc) return rc; nvdimm_bus = acpi_desc->nvdimm_bus; if (nfit_spa_type(spa) == NFIT_SPA_PM) { - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) { - if (!nvdimm_volatile_region_create(nvdimm_bus, &ndr_desc)) + if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc)) return -ENOMEM; } return 0; @@ -996,11 +1414,14 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) u8 *data; int rc; + INIT_LIST_HEAD(&acpi_desc->spa_maps); INIT_LIST_HEAD(&acpi_desc->spas); INIT_LIST_HEAD(&acpi_desc->dcrs); INIT_LIST_HEAD(&acpi_desc->bdws); + INIT_LIST_HEAD(&acpi_desc->idts); INIT_LIST_HEAD(&acpi_desc->memdevs); INIT_LIST_HEAD(&acpi_desc->dimms); + mutex_init(&acpi_desc->spa_map_mutex); data = (u8 *) acpi_desc->nfit; end = data + sz; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index b76e33629098..7bd38b7baf39 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -52,6 +52,11 @@ struct nfit_bdw { struct list_head list; }; +struct nfit_idt { + struct acpi_nfit_interleave *idt; + struct list_head list; +}; + struct nfit_memdev { struct acpi_nfit_memory_map *memdev; struct list_head list; @@ -62,10 +67,13 @@ struct nfit_mem { struct nvdimm *nvdimm; struct acpi_nfit_memory_map *memdev_dcr; struct acpi_nfit_memory_map *memdev_pmem; + struct acpi_nfit_memory_map *memdev_bdw; struct acpi_nfit_control_region *dcr; struct acpi_nfit_data_region *bdw; struct acpi_nfit_system_address *spa_dcr; struct acpi_nfit_system_address *spa_bdw; + struct acpi_nfit_interleave *idt_dcr; + struct acpi_nfit_interleave *idt_bdw; struct list_head list; struct acpi_device *adev; unsigned long dsm_mask; @@ -74,16 +82,57 @@ struct nfit_mem { struct acpi_nfit_desc { struct nvdimm_bus_descriptor nd_desc; struct acpi_table_nfit *nfit; + struct mutex spa_map_mutex; + struct list_head spa_maps; struct list_head memdevs; struct list_head dimms; struct list_head spas; struct list_head dcrs; struct list_head bdws; + struct list_head idts; struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; }; +enum nd_blk_mmio_selector { + BDW, + DCR, +}; + +struct nfit_blk { + struct nfit_blk_mmio { + union { + void __iomem *base; + void *aperture; + }; + u64 size; + u64 base_offset; + u32 line_size; + u32 num_lines; + u32 table_size; + struct acpi_nfit_interleave *idt; + struct acpi_nfit_system_address *spa; + } mmio[2]; + struct nd_region *nd_region; + u64 bdw_offset; /* post interleave offset */ + u64 stat_offset; + u64 cmd_offset; +}; + +struct nfit_spa_mapping { + struct acpi_nfit_desc *acpi_desc; + struct acpi_nfit_system_address *spa; + struct list_head list; + struct kref kref; + void __iomem *iomem; +}; + +static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref) +{ + return container_of(kref, struct nfit_spa_mapping, kref); +} + static inline struct acpi_nfit_memory_map *__to_nfit_memdev( struct nfit_mem *nfit_mem) { diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 204ee0796411..72226acb5c0f 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -34,6 +34,19 @@ config BLK_DEV_PMEM Say Y if you want to use an NVDIMM +config ND_BLK + tristate "BLK: Block data window (aperture) device support" + default LIBNVDIMM + select ND_BTT if BTT + help + Support NVDIMMs, or other devices, that implement a BLK-mode + access capability. BLK-mode access uses memory-mapped-i/o + apertures to access persistent media. + + Say Y if your platform firmware emits an ACPI.NFIT table + (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode + capabilities. + config ND_BTT tristate diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index d2aab6c58492..594bb97c867a 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -1,11 +1,14 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o nd_pmem-y := pmem.o nd_btt-y := btt.o +nd_blk-y := blk.o + libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c new file mode 100644 index 000000000000..9ac0c266c15c --- /dev/null +++ b/drivers/nvdimm/blk.c @@ -0,0 +1,245 @@ +/* + * NVDIMM Block Window Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "nd.h" + +struct nd_blk_device { + struct request_queue *queue; + struct gendisk *disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + size_t disk_size; +}; + +static int nd_blk_major; + +static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, + resource_size_t ns_offset, unsigned int len) +{ + int i; + + for (i = 0; i < nsblk->num_resources; i++) { + if (ns_offset < resource_size(nsblk->res[i])) { + if (ns_offset + len > resource_size(nsblk->res[i])) { + dev_WARN_ONCE(&nsblk->common.dev, 1, + "illegal request\n"); + return SIZE_MAX; + } + return nsblk->res[i]->start + ns_offset; + } + ns_offset -= resource_size(nsblk->res[i]); + } + + dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n"); + return SIZE_MAX; +} + +static void nd_blk_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_device *blk_dev; + struct nd_blk_region *ndbr; + struct bvec_iter iter; + struct bio_vec bvec; + int err = 0, rw; + + blk_dev = disk->private_data; + nsblk = blk_dev->nsblk; + ndbr = blk_dev->ndbr; + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + resource_size_t dev_offset; + void *iobuf; + + BUG_ON(len > PAGE_SIZE); + + dev_offset = to_dev_offset(nsblk, + iter.bi_sector << SECTOR_SHIFT, len); + if (dev_offset == SIZE_MAX) { + err = -EIO; + goto out; + } + + iobuf = kmap_atomic(bvec.bv_page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + bvec.bv_offset, + len, rw); + kunmap_atomic(iobuf); + if (err) + goto out; + } + + out: + bio_endio(bio, err); +} + +static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *iobuf, size_t n, int rw) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim); + struct nd_namespace_blk *nsblk = blk_dev->nsblk; + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset; + + dev_offset = to_dev_offset(nsblk, offset, n); + + if (unlikely(offset + n > blk_dev->disk_size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (dev_offset == SIZE_MAX) + return -EIO; + + return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw); +} + +static const struct block_device_operations nd_blk_fops = { + .owner = THIS_MODULE, +}; + +static int nd_blk_attach_disk(struct nd_namespace_common *ndns, + struct nd_blk_device *blk_dev) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); + struct gendisk *disk; + + blk_dev->queue = blk_alloc_queue(GFP_KERNEL); + if (!blk_dev->queue) + return -ENOMEM; + + blk_queue_make_request(blk_dev->queue, nd_blk_make_request); + blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX); + blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY); + blk_queue_logical_block_size(blk_dev->queue, nsblk->lbasize); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue); + + disk = blk_dev->disk = alloc_disk(0); + if (!disk) { + blk_cleanup_queue(blk_dev->queue); + return -ENOMEM; + } + + disk->driverfs_dev = &ndns->dev; + disk->major = nd_blk_major; + disk->first_minor = 0; + disk->fops = &nd_blk_fops; + disk->private_data = blk_dev; + disk->queue = blk_dev->queue; + disk->flags = GENHD_FL_EXT_DEVT; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + set_capacity(disk, blk_dev->disk_size >> SECTOR_SHIFT); + add_disk(disk); + + return 0; +} + +static int nd_blk_probe(struct device *dev) +{ + struct nd_namespace_common *ndns; + struct nd_blk_device *blk_dev; + int rc; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL); + if (!blk_dev) + return -ENOMEM; + + blk_dev->disk_size = nvdimm_namespace_capacity(ndns); + blk_dev->ndbr = to_nd_blk_region(dev->parent); + blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); + dev_set_drvdata(dev, blk_dev); + + ndns->rw_bytes = nd_blk_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, blk_dev) == 0) { + /* we'll come back as btt-blk */ + rc = -ENXIO; + } else + rc = nd_blk_attach_disk(ndns, blk_dev); + if (rc) + kfree(blk_dev); + return rc; +} + +static void nd_blk_detach_disk(struct nd_blk_device *blk_dev) +{ + del_gendisk(blk_dev->disk); + put_disk(blk_dev->disk); + blk_cleanup_queue(blk_dev->queue); +} + +static int nd_blk_remove(struct device *dev) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + else + nd_blk_detach_disk(blk_dev); + kfree(blk_dev); + + return 0; +} + +static struct nd_device_driver nd_blk_driver = { + .probe = nd_blk_probe, + .remove = nd_blk_remove, + .drv = { + .name = "nd_blk", + }, + .type = ND_DRIVER_NAMESPACE_BLK, +}; + +static int __init nd_blk_init(void) +{ + int rc; + + rc = register_blkdev(0, "nd_blk"); + if (rc < 0) + return rc; + + nd_blk_major = rc; + rc = nd_driver_register(&nd_blk_driver); + + if (rc < 0) + unregister_blkdev(nd_blk_major, "nd_blk"); + + return rc; +} + +static void __exit nd_blk_exit(void) +{ + driver_unregister(&nd_blk_driver.drv); + unregister_blkdev(nd_blk_major, "nd_blk"); +} + +MODULE_AUTHOR("Ross Zwisler "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK); +module_init(nd_blk_init); +module_exit(nd_blk_exit); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 83b179ed6d61..c05eb807d674 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -209,6 +209,15 @@ struct nvdimm *to_nvdimm(struct device *dev) } EXPORT_SYMBOL_GPL(to_nvdimm); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) +{ + struct nd_region *nd_region = &ndbr->nd_region; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->nvdimm; +} +EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); + struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) { struct nvdimm *nvdimm = nd_mapping->nvdimm; diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 4aa647c8d644..1ce1e70de44a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -173,6 +173,65 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk) return size; } +static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent); + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_label_id label_id; + struct resource *res; + int count, i; + + if (!nsblk->uuid || !nsblk->lbasize || !ndd) + return false; + + count = 0; + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + for_each_dpa_resource(ndd, res) { + if (strcmp(res->name, label_id.id) != 0) + continue; + /* + * Resources with unacknoweldged adjustments indicate a + * failure to update labels + */ + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + count++; + } + + /* These values match after a successful label update */ + if (count != nsblk->num_resources) + return false; + + for (i = 0; i < nsblk->num_resources; i++) { + struct resource *found = NULL; + + for_each_dpa_resource(ndd, res) + if (res == nsblk->res[i]) { + found = res; + break; + } + /* stale resource */ + if (!found) + return false; + } + + return true; +} + +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk) +{ + resource_size_t size; + + nvdimm_bus_lock(&nsblk->common.dev); + size = __nd_namespace_blk_validate(nsblk); + nvdimm_bus_unlock(&nsblk->common.dev); + + return size; +} +EXPORT_SYMBOL(nd_namespace_blk_validate); + + static int nd_namespace_label_update(struct nd_region *nd_region, struct device *dev) { @@ -1224,7 +1283,11 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) return ERR_PTR(-ENODEV); } } else if (is_namespace_blk(&ndns->dev)) { - return ERR_PTR(-ENODEV); /* TODO */ + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + if (!nd_namespace_blk_validate(nsblk)) + return ERR_PTR(-ENODEV); } return ndns; diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 5e6413964776..e1970c71ad1c 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -43,9 +43,8 @@ struct nvdimm { }; bool is_nvdimm(struct device *dev); -bool is_nd_blk(struct device *dev); bool is_nd_pmem(struct device *dev); -struct nd_btt; +bool is_nd_blk(struct device *dev); struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 1b937c235913..f153f43ca3d6 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -102,6 +102,15 @@ struct nd_region { struct nd_mapping mapping[0]; }; +struct nd_blk_region { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + void *blk_provider_data; + struct nd_region nd_region; +}; + /* * Lookup next in the repeating sequence of 01, 10, and 11. */ @@ -171,8 +180,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) #endif struct nd_region *to_nd_region(struct device *dev); -unsigned int nd_region_acquire_lane(struct nd_region *nd_region); -void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); int nd_region_to_nstype(struct nd_region *nd_region); int nd_region_register_namespaces(struct nd_region *nd_region, int *err); u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); @@ -192,4 +199,6 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns); int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns); const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, char *name); +int nd_blk_region_init(struct nd_region *nd_region); +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk); #endif /* __ND_H__ */ diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index eb8aebcd4800..f28f78ccff19 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -18,11 +18,10 @@ static int nd_region_probe(struct device *dev) { - int err; + int err, rc; static unsigned long once; struct nd_region_namespaces *num_ns; struct nd_region *nd_region = to_nd_region(dev); - int rc = nd_region_register_namespaces(nd_region, &err); if (nd_region->num_lanes > num_online_cpus() && nd_region->num_lanes < num_possible_cpus() @@ -34,6 +33,11 @@ static int nd_region_probe(struct device *dev) nd_region->num_lanes); } + rc = nd_blk_region_init(nd_region); + if (rc) + return rc; + + rc = nd_region_register_namespaces(nd_region, &err); num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); if (!num_ns) return -ENOMEM; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index fe8ec21fe3d7..2cfb3f74bcbf 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #include +#include #include #include #include @@ -34,7 +35,10 @@ static void nd_region_release(struct device *dev) } free_percpu(nd_region->lane); ida_simple_remove(®ion_ida, nd_region->id); - kfree(nd_region); + if (is_nd_blk(dev)) + kfree(to_nd_blk_region(dev)); + else + kfree(nd_region); } static struct device_type nd_blk_device_type = { @@ -71,6 +75,33 @@ struct nd_region *to_nd_region(struct device *dev) } EXPORT_SYMBOL_GPL(to_nd_region); +struct nd_blk_region *to_nd_blk_region(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + WARN_ON(!is_nd_blk(dev)); + return container_of(nd_region, struct nd_blk_region, nd_region); +} +EXPORT_SYMBOL_GPL(to_nd_blk_region); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr) +{ + return ndbr->blk_provider_data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_provider_data); + +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data) +{ + ndbr->blk_provider_data = data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); + /** * nd_region_to_nstype() - region to an integer namespace type * @nd_region: region-device to interrogate @@ -365,7 +396,8 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) /* * Upon successful probe/remove, take/release a reference on the * associated interleave set (if present), and plant new btt + namespace - * seeds. + * seeds. Also, on the removal of a BLK region, notify the provider to + * disable the region. */ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, struct device *dev, bool probe) @@ -385,8 +417,14 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, nd_mapping->labels = NULL; put_ndd(ndd); nd_mapping->ndd = NULL; - atomic_dec(&nvdimm->busy); + if (ndd) + atomic_dec(&nvdimm->busy); } + + if (is_nd_pmem(dev)) + return; + + to_nd_blk_region(dev)->disable(nvdimm_bus, dev); } if (dev->parent && is_nd_blk(dev->parent) && probe) { nd_region = to_nd_region(dev->parent); @@ -526,11 +564,21 @@ struct attribute_group nd_mapping_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_mapping_attribute_group); -void *nd_region_provider_data(struct nd_region *nd_region) +int nd_blk_region_init(struct nd_region *nd_region) { - return nd_region->provider_data; + struct device *dev = &nd_region->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!is_nd_blk(dev)) + return 0; + + if (nd_region->ndr_mappings < 1) { + dev_err(dev, "invalid BLK region\n"); + return -ENXIO; + } + + return to_nd_blk_region(dev)->enable(nvdimm_bus, dev); } -EXPORT_SYMBOL_GPL(nd_region_provider_data); /** * nd_region_acquire_lane - allocate and lock a lane @@ -591,6 +639,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, { struct nd_region *nd_region; struct device *dev; + void *region_buf; unsigned int i; for (i = 0; i < ndr_desc->num_mappings; i++) { @@ -605,10 +654,30 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, } } - nd_region = kzalloc(sizeof(struct nd_region) - + sizeof(struct nd_mapping) * ndr_desc->num_mappings, - GFP_KERNEL); - if (!nd_region) + if (dev_type == &nd_blk_device_type) { + struct nd_blk_region_desc *ndbr_desc; + struct nd_blk_region *ndbr; + + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + if (ndbr) { + nd_region = &ndbr->nd_region; + ndbr->enable = ndbr_desc->enable; + ndbr->disable = ndbr_desc->disable; + ndbr->do_io = ndbr_desc->do_io; + } + region_buf = ndbr; + } else { + nd_region = kzalloc(sizeof(struct nd_region) + + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + region_buf = nd_region; + } + + if (!region_buf) return NULL; nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); if (nd_region->id < 0) @@ -654,7 +723,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, err_percpu: ida_simple_remove(®ion_ida, nd_region->id); err_id: - kfree(nd_region); + kfree(region_buf); return NULL; } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 531d99dfac68..7fc1b25bdb5d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -14,6 +14,7 @@ */ #ifndef __LIBNVDIMM_H__ #define __LIBNVDIMM_H__ +#include #include #include @@ -89,8 +90,24 @@ struct nd_region_desc { }; struct nvdimm_bus; -struct device; struct module; +struct device; +struct nd_blk_region; +struct nd_blk_region_desc { + int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev); + int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); + struct nd_region_desc ndr_desc; +}; + +static inline struct nd_blk_region_desc *to_blk_region_desc( + struct nd_region_desc *ndr_desc) +{ + return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc); + +} + struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc, struct module *module); #define nvdimm_bus_register(parent, desc) \ @@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); struct nvdimm_bus *to_nvdimm_bus(struct device *dev); struct nvdimm *to_nvdimm(struct device *dev); struct nd_region *to_nd_region(struct device *dev); +struct nd_blk_region *to_nd_blk_region(struct device *dev); struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus); const char *nvdimm_name(struct nvdimm *nvdimm); void *nvdimm_provider_data(struct nvdimm *nvdimm); -void *nd_region_provider_data(struct nd_region *nd_region); struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, const struct attribute_group **groups, unsigned long flags, unsigned long *dsm_mask); @@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, struct nd_region_desc *ndr_desc); +void *nd_region_provider_data(struct nd_region *nd_region); +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr); +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data); +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr); +unsigned int nd_region_acquire_lane(struct nd_region *nd_region); +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 6bc756193ff61bf5e7b3cfedfbb0873bf40f8055 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 17 Jun 2015 17:23:32 -0400 Subject: tools/testing/nvdimm: libnvdimm unit test infrastructure 'libnvdimm' is the first driver sub-system in the kernel to implement mocking for unit test coverage. The nfit_test module gets built as an external module and arranges for external module replacements of nfit, libnvdimm, nd_pmem, and nd_blk. These replacements use the linker --wrap option to redirect calls to ioremap() + request_mem_region() to custom defined unit test resources. The end result is a fully functional nvdimm_bus, as far as userspace is concerned, but with the capability to perform otherwise destructive tests on emulated resources. Q: Why not use QEMU for this emulation? QEMU is not suitable for unit testing. QEMU's role is to faithfully emulate the platform. A unit test's role is to unfaithfully implement the platform with the goal of triggering bugs in the corners of the sub-system implementation. As bugs are discovered in platforms, or the sub-system itself, the unit tests are extended to backstop a fix with a reproducer unit test. Another problem with QEMU is that it would require coordination of 3 software projects instead of 2 (kernel + libndctl [1]) to maintain and execute the tests. The chances for bit rot and the difficulty of getting the tests running goes up non-linearly the more components involved. Q: Why submit this to the kernel tree instead of external modules in libndctl? Simple, to alleviate the same risk that out-of-tree external modules face. Updates to drivers/nvdimm/ can be immediately evaluated to see if they have any impact on tools/testing/nvdimm/. Q: What are the negative implications of merging this? It is a unique maintenance burden because the purpose of mocking an interface to enable a unit test is to purposefully short circuit the semantics of a routine to enable testing. For example __wrap_ioremap_cache() fakes the pmem driver into "ioremap()'ing" a test resource buffer allocated by dma_alloc_coherent(). The future maintenance burden hits when someone changes the semantics of ioremap_cache() and wonders what the implications are for the unit test. [1]: https://github.com/pmem/ndctl Cc: Cc: Lv Zheng Cc: Robert Moore Cc: Rafael J. Wysocki Cc: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 12 +- drivers/acpi/nfit.h | 6 + tools/testing/nvdimm/Kbuild | 40 ++ tools/testing/nvdimm/Makefile | 7 + tools/testing/nvdimm/config_check.c | 15 + tools/testing/nvdimm/test/Kbuild | 8 + tools/testing/nvdimm/test/iomap.c | 151 +++++ tools/testing/nvdimm/test/nfit.c | 1113 +++++++++++++++++++++++++++++++++ tools/testing/nvdimm/test/nfit_test.h | 29 + 9 files changed, 1377 insertions(+), 4 deletions(-) create mode 100644 tools/testing/nvdimm/Kbuild create mode 100644 tools/testing/nvdimm/Makefile create mode 100644 tools/testing/nvdimm/config_check.c create mode 100644 tools/testing/nvdimm/test/Kbuild create mode 100644 tools/testing/nvdimm/test/iomap.c create mode 100644 tools/testing/nvdimm/test/nfit.c create mode 100644 tools/testing/nvdimm/test/nfit_test.h (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 2aa6aa97b40c..07d630e9f4ae 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -33,10 +33,11 @@ MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status"); static u8 nfit_uuid[NFIT_UUID_MAX][16]; -static const u8 *to_nfit_uuid(enum nfit_uuids id) +const u8 *to_nfit_uuid(enum nfit_uuids id) { return nfit_uuid[id]; } +EXPORT_SYMBOL(to_nfit_uuid); static struct acpi_nfit_desc *to_acpi_nfit_desc( struct nvdimm_bus_descriptor *nd_desc) @@ -581,11 +582,12 @@ static struct attribute_group acpi_nfit_attribute_group = { .attrs = acpi_nfit_attributes, }; -static const struct attribute_group *acpi_nfit_attribute_groups[] = { +const struct attribute_group *acpi_nfit_attribute_groups[] = { &nvdimm_bus_attribute_group, &acpi_nfit_attribute_group, NULL, }; +EXPORT_SYMBOL_GPL(acpi_nfit_attribute_groups); static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev) { @@ -1323,7 +1325,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc, ndbr_desc = to_blk_region_desc(ndr_desc); ndbr_desc->enable = acpi_nfit_blk_region_enable; ndbr_desc->disable = acpi_nfit_blk_region_disable; - ndbr_desc->do_io = acpi_nfit_blk_region_do_io; + ndbr_desc->do_io = acpi_desc->blk_do_io; if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc)) return -ENOMEM; break; @@ -1407,7 +1409,7 @@ static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc) return 0; } -static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) +int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) { struct device *dev = acpi_desc->dev; const void *end; @@ -1446,6 +1448,7 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz) return acpi_nfit_register_regions(acpi_desc); } +EXPORT_SYMBOL_GPL(acpi_nfit_init); static int acpi_nfit_add(struct acpi_device *adev) { @@ -1470,6 +1473,7 @@ static int acpi_nfit_add(struct acpi_device *adev) dev_set_drvdata(dev, acpi_desc); acpi_desc->dev = dev; acpi_desc->nfit = (struct acpi_table_nfit *) tbl; + acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io; nd_desc = &acpi_desc->nd_desc; nd_desc->provider_name = "ACPI.NFIT"; nd_desc->ndctl = acpi_nfit_ctl; diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index 7bd38b7baf39..c62fffea8423 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -93,6 +93,8 @@ struct acpi_nfit_desc { struct nvdimm_bus *nvdimm_bus; struct device *dev; unsigned long dimm_dsm_force_en; + int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw); }; enum nd_blk_mmio_selector { @@ -146,4 +148,8 @@ static inline struct acpi_nfit_desc *to_acpi_desc( { return container_of(nd_desc, struct acpi_nfit_desc, nd_desc); } + +const u8 *to_nfit_uuid(enum nfit_uuids id); +int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz); +extern const struct attribute_group *acpi_nfit_attribute_groups[]; #endif /* __NFIT_H__ */ diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild new file mode 100644 index 000000000000..8e9b64520ec1 --- /dev/null +++ b/tools/testing/nvdimm/Kbuild @@ -0,0 +1,40 @@ +ldflags-y += --wrap=ioremap_cache +ldflags-y += --wrap=ioremap_nocache +ldflags-y += --wrap=iounmap +ldflags-y += --wrap=__request_region +ldflags-y += --wrap=__release_region + +DRIVERS := ../../../drivers +NVDIMM_SRC := $(DRIVERS)/nvdimm +ACPI_SRC := $(DRIVERS)/acpi + +obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o +obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_ACPI_NFIT) += nfit.o + +nfit-y := $(ACPI_SRC)/nfit.o +nfit-y += config_check.o + +nd_pmem-y := $(NVDIMM_SRC)/pmem.o +nd_pmem-y += config_check.o + +nd_btt-y := $(NVDIMM_SRC)/btt.o +nd_btt-y += config_check.o + +nd_blk-y := $(NVDIMM_SRC)/blk.o +nd_blk-y += config_check.o + +libnvdimm-y := $(NVDIMM_SRC)/core.o +libnvdimm-y += $(NVDIMM_SRC)/bus.o +libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o +libnvdimm-y += $(NVDIMM_SRC)/dimm.o +libnvdimm-y += $(NVDIMM_SRC)/region_devs.o +libnvdimm-y += $(NVDIMM_SRC)/region.o +libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o +libnvdimm-y += $(NVDIMM_SRC)/label.o +libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o +libnvdimm-y += config_check.o + +obj-m += test/ diff --git a/tools/testing/nvdimm/Makefile b/tools/testing/nvdimm/Makefile new file mode 100644 index 000000000000..3dfe024b4e7e --- /dev/null +++ b/tools/testing/nvdimm/Makefile @@ -0,0 +1,7 @@ +KDIR ?= ../../../ + +default: + $(MAKE) -C $(KDIR) M=$$PWD + +install: default + $(MAKE) -C $(KDIR) M=$$PWD modules_install diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c new file mode 100644 index 000000000000..f2c7615554eb --- /dev/null +++ b/tools/testing/nvdimm/config_check.c @@ -0,0 +1,15 @@ +#include +#include + +void check(void) +{ + /* + * These kconfig symbols must be set to "m" for nfit_test to + * load and operate. + */ + BUILD_BUG_ON(!IS_MODULE(CONFIG_LIBNVDIMM)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK)); + BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT)); +} diff --git a/tools/testing/nvdimm/test/Kbuild b/tools/testing/nvdimm/test/Kbuild new file mode 100644 index 000000000000..9241064970fe --- /dev/null +++ b/tools/testing/nvdimm/test/Kbuild @@ -0,0 +1,8 @@ +ccflags-y := -I$(src)/../../../../drivers/nvdimm/ +ccflags-y += -I$(src)/../../../../drivers/acpi/ + +obj-m += nfit_test.o +obj-m += nfit_test_iomap.o + +nfit_test-y := nfit.o +nfit_test_iomap-y := iomap.o diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c new file mode 100644 index 000000000000..c85a6f6ba559 --- /dev/null +++ b/tools/testing/nvdimm/test/iomap.c @@ -0,0 +1,151 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include "nfit_test.h" + +static LIST_HEAD(iomap_head); + +static struct iomap_ops { + nfit_test_lookup_fn nfit_test_lookup; + struct list_head list; +} iomap_ops = { + .list = LIST_HEAD_INIT(iomap_ops.list), +}; + +void nfit_test_setup(nfit_test_lookup_fn lookup) +{ + iomap_ops.nfit_test_lookup = lookup; + list_add_rcu(&iomap_ops.list, &iomap_head); +} +EXPORT_SYMBOL(nfit_test_setup); + +void nfit_test_teardown(void) +{ + list_del_rcu(&iomap_ops.list); + synchronize_rcu(); +} +EXPORT_SYMBOL(nfit_test_teardown); + +static struct nfit_test_resource *get_nfit_res(resource_size_t resource) +{ + struct iomap_ops *ops; + + ops = list_first_or_null_rcu(&iomap_head, typeof(*ops), list); + if (ops) + return ops->nfit_test_lookup(resource); + return NULL; +} + +void __iomem *__nfit_test_ioremap(resource_size_t offset, unsigned long size, + void __iomem *(*fallback_fn)(resource_size_t, unsigned long)) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res(offset); + rcu_read_unlock(); + if (nfit_res) + return (void __iomem *) nfit_res->buf + offset + - nfit_res->res->start; + return fallback_fn(offset, size); +} + +void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size) +{ + return __nfit_test_ioremap(offset, size, ioremap_cache); +} +EXPORT_SYMBOL(__wrap_ioremap_cache); + +void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size) +{ + return __nfit_test_ioremap(offset, size, ioremap_nocache); +} +EXPORT_SYMBOL(__wrap_ioremap_nocache); + +void __wrap_iounmap(volatile void __iomem *addr) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res((unsigned long) addr); + rcu_read_unlock(); + if (nfit_res) + return; + return iounmap(addr); +} +EXPORT_SYMBOL(__wrap_iounmap); + +struct resource *__wrap___request_region(struct resource *parent, + resource_size_t start, resource_size_t n, const char *name, + int flags) +{ + struct nfit_test_resource *nfit_res; + + if (parent == &iomem_resource) { + rcu_read_lock(); + nfit_res = get_nfit_res(start); + rcu_read_unlock(); + if (nfit_res) { + struct resource *res = nfit_res->res + 1; + + if (start + n > nfit_res->res->start + + resource_size(nfit_res->res)) { + pr_debug("%s: start: %llx n: %llx overflow: %pr\n", + __func__, start, n, + nfit_res->res); + return NULL; + } + + res->start = start; + res->end = start + n - 1; + res->name = name; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + pr_debug("%s: %pr\n", __func__, res); + return res; + } + } + return __request_region(parent, start, n, name, flags); +} +EXPORT_SYMBOL(__wrap___request_region); + +void __wrap___release_region(struct resource *parent, resource_size_t start, + resource_size_t n) +{ + struct nfit_test_resource *nfit_res; + + if (parent == &iomem_resource) { + rcu_read_lock(); + nfit_res = get_nfit_res(start); + rcu_read_unlock(); + if (nfit_res) { + struct resource *res = nfit_res->res + 1; + + if (start != res->start || resource_size(res) != n) + pr_info("%s: start: %llx n: %llx mismatch: %pr\n", + __func__, start, n, res); + else + memset(res, 0, sizeof(*res)); + return; + } + } + __release_region(parent, start, n); +} +EXPORT_SYMBOL(__wrap___release_region); + +MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c new file mode 100644 index 000000000000..7a4a5a5edbe4 --- /dev/null +++ b/tools/testing/nvdimm/test/nfit.c @@ -0,0 +1,1113 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nfit_test.h" + +/* + * Generate an NFIT table to describe the following topology: + * + * BUS0: Interleaved PMEM regions, and aliasing with BLK regions + * + * (a) (b) DIMM BLK-REGION + * +----------+--------------+----------+---------+ + * +------+ | blk2.0 | pm0.0 | blk2.1 | pm1.0 | 0 region2 + * | imc0 +--+- - - - - region0 - - - -+----------+ + + * +--+---+ | blk3.0 | pm0.0 | blk3.1 | pm1.0 | 1 region3 + * | +----------+--------------v----------v v + * +--+---+ | | + * | cpu0 | region1 + * +--+---+ | | + * | +-------------------------^----------^ ^ + * +--+---+ | blk4.0 | pm1.0 | 2 region4 + * | imc1 +--+-------------------------+----------+ + + * +------+ | blk5.0 | pm1.0 | 3 region5 + * +-------------------------+----------+-+-------+ + * + * *) In this layout we have four dimms and two memory controllers in one + * socket. Each unique interface (BLK or PMEM) to DPA space + * is identified by a region device with a dynamically assigned id. + * + * *) The first portion of dimm0 and dimm1 are interleaved as REGION0. + * A single PMEM namespace "pm0.0" is created using half of the + * REGION0 SPA-range. REGION0 spans dimm0 and dimm1. PMEM namespace + * allocate from from the bottom of a region. The unallocated + * portion of REGION0 aliases with REGION2 and REGION3. That + * unallacted capacity is reclaimed as BLK namespaces ("blk2.0" and + * "blk3.0") starting at the base of each DIMM to offset (a) in those + * DIMMs. "pm0.0", "blk2.0" and "blk3.0" are free-form readable + * names that can be assigned to a namespace. + * + * *) In the last portion of dimm0 and dimm1 we have an interleaved + * SPA range, REGION1, that spans those two dimms as well as dimm2 + * and dimm3. Some of REGION1 allocated to a PMEM namespace named + * "pm1.0" the rest is reclaimed in 4 BLK namespaces (for each + * dimm in the interleave set), "blk2.1", "blk3.1", "blk4.0", and + * "blk5.0". + * + * *) The portion of dimm2 and dimm3 that do not participate in the + * REGION1 interleaved SPA range (i.e. the DPA address below offset + * (b) are also included in the "blk4.0" and "blk5.0" namespaces. + * Note, that BLK namespaces need not be contiguous in DPA-space, and + * can consume aliased capacity from multiple interleave sets. + * + * BUS1: Legacy NVDIMM (single contiguous range) + * + * region2 + * +---------------------+ + * |---------------------| + * || pm2.0 || + * |---------------------| + * +---------------------+ + * + * *) A NFIT-table may describe a simple system-physical-address range + * with no BLK aliasing. This type of region may optionally + * reference an NVDIMM. + */ +enum { + NUM_PM = 2, + NUM_DCR = 4, + NUM_BDW = NUM_DCR, + NUM_SPA = NUM_PM + NUM_DCR + NUM_BDW, + NUM_MEM = NUM_DCR + NUM_BDW + 2 /* spa0 iset */ + 4 /* spa1 iset */, + DIMM_SIZE = SZ_32M, + LABEL_SIZE = SZ_128K, + SPA0_SIZE = DIMM_SIZE, + SPA1_SIZE = DIMM_SIZE*2, + SPA2_SIZE = DIMM_SIZE, + BDW_SIZE = 64 << 8, + DCR_SIZE = 12, + NUM_NFITS = 2, /* permit testing multiple NFITs per system */ +}; + +struct nfit_test_dcr { + __le64 bdw_addr; + __le32 bdw_status; + __u8 aperature[BDW_SIZE]; +}; + +#define NFIT_DIMM_HANDLE(node, socket, imc, chan, dimm) \ + (((node & 0xfff) << 16) | ((socket & 0xf) << 12) \ + | ((imc & 0xf) << 8) | ((chan & 0xf) << 4) | (dimm & 0xf)) + +static u32 handle[NUM_DCR] = { + [0] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 0), + [1] = NFIT_DIMM_HANDLE(0, 0, 0, 0, 1), + [2] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 0), + [3] = NFIT_DIMM_HANDLE(0, 0, 1, 0, 1), +}; + +struct nfit_test { + struct acpi_nfit_desc acpi_desc; + struct platform_device pdev; + struct list_head resources; + void *nfit_buf; + dma_addr_t nfit_dma; + size_t nfit_size; + int num_dcr; + int num_pm; + void **dimm; + dma_addr_t *dimm_dma; + void **label; + dma_addr_t *label_dma; + void **spa_set; + dma_addr_t *spa_set_dma; + struct nfit_test_dcr **dcr; + dma_addr_t *dcr_dma; + int (*alloc)(struct nfit_test *t); + void (*setup)(struct nfit_test *t); +}; + +static struct nfit_test *to_nfit_test(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + + return container_of(pdev, struct nfit_test, pdev); +} + +static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len) +{ + struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc); + struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc); + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + int i, rc; + + if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask)) + return -ENXIO; + + /* lookup label space for the given dimm */ + for (i = 0; i < ARRAY_SIZE(handle); i++) + if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i]) + break; + if (i >= ARRAY_SIZE(handle)) + return -ENXIO; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: { + struct nd_cmd_get_config_size *nd_cmd = buf; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + nd_cmd->status = 0; + nd_cmd->config_size = LABEL_SIZE; + nd_cmd->max_xfer = SZ_4K; + rc = 0; + break; + } + case ND_CMD_GET_CONFIG_DATA: { + struct nd_cmd_get_config_data_hdr *nd_cmd = buf; + unsigned int len, offset = nd_cmd->in_offset; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len) + return -EINVAL; + + nd_cmd->status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(nd_cmd->out_buf, t->label[i] + offset, len); + rc = buf_len - sizeof(*nd_cmd) - len; + break; + } + case ND_CMD_SET_CONFIG_DATA: { + struct nd_cmd_set_config_hdr *nd_cmd = buf; + unsigned int len, offset = nd_cmd->in_offset; + u32 *status; + + if (buf_len < sizeof(*nd_cmd)) + return -EINVAL; + if (offset >= LABEL_SIZE) + return -EINVAL; + if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len) + return -EINVAL; + + status = buf + nd_cmd->in_length + sizeof(*nd_cmd); + *status = 0; + len = min(nd_cmd->in_length, LABEL_SIZE - offset); + memcpy(t->label[i] + offset, nd_cmd->in_buf, len); + rc = buf_len - sizeof(*nd_cmd) - (len + 4); + break; + } + default: + return -ENOTTY; + } + + return rc; +} + +static DEFINE_SPINLOCK(nfit_test_lock); +static struct nfit_test *instances[NUM_NFITS]; + +static void release_nfit_res(void *data) +{ + struct nfit_test_resource *nfit_res = data; + struct resource *res = nfit_res->res; + + spin_lock(&nfit_test_lock); + list_del(&nfit_res->list); + spin_unlock(&nfit_test_lock); + + if (is_vmalloc_addr(nfit_res->buf)) + vfree(nfit_res->buf); + else + dma_free_coherent(nfit_res->dev, resource_size(res), + nfit_res->buf, res->start); + kfree(res); + kfree(nfit_res); +} + +static void *__test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma, + void *buf) +{ + struct device *dev = &t->pdev.dev; + struct resource *res = kzalloc(sizeof(*res) * 2, GFP_KERNEL); + struct nfit_test_resource *nfit_res = kzalloc(sizeof(*nfit_res), + GFP_KERNEL); + int rc; + + if (!res || !buf || !nfit_res) + goto err; + rc = devm_add_action(dev, release_nfit_res, nfit_res); + if (rc) + goto err; + INIT_LIST_HEAD(&nfit_res->list); + memset(buf, 0, size); + nfit_res->dev = dev; + nfit_res->buf = buf; + nfit_res->res = res; + res->start = *dma; + res->end = *dma + size - 1; + res->name = "NFIT"; + spin_lock(&nfit_test_lock); + list_add(&nfit_res->list, &t->resources); + spin_unlock(&nfit_test_lock); + + return nfit_res->buf; + err: + if (buf && !is_vmalloc_addr(buf)) + dma_free_coherent(dev, size, buf, *dma); + else if (buf) + vfree(buf); + kfree(res); + kfree(nfit_res); + return NULL; +} + +static void *test_alloc(struct nfit_test *t, size_t size, dma_addr_t *dma) +{ + void *buf = vmalloc(size); + + *dma = (unsigned long) buf; + return __test_alloc(t, size, dma, buf); +} + +static void *test_alloc_coherent(struct nfit_test *t, size_t size, + dma_addr_t *dma) +{ + struct device *dev = &t->pdev.dev; + void *buf = dma_alloc_coherent(dev, size, dma, GFP_KERNEL); + + return __test_alloc(t, size, dma, buf); +} + +static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(instances); i++) { + struct nfit_test_resource *n, *nfit_res = NULL; + struct nfit_test *t = instances[i]; + + if (!t) + continue; + spin_lock(&nfit_test_lock); + list_for_each_entry(n, &t->resources, list) { + if (addr >= n->res->start && (addr < n->res->start + + resource_size(n->res))) { + nfit_res = n; + break; + } else if (addr >= (unsigned long) n->buf + && (addr < (unsigned long) n->buf + + resource_size(n->res))) { + nfit_res = n; + break; + } + } + spin_unlock(&nfit_test_lock); + if (nfit_res) + return nfit_res; + } + + return NULL; +} + +static int nfit_test0_alloc(struct nfit_test *t) +{ + size_t nfit_size = sizeof(struct acpi_table_nfit) + + sizeof(struct acpi_nfit_system_address) * NUM_SPA + + sizeof(struct acpi_nfit_memory_map) * NUM_MEM + + sizeof(struct acpi_nfit_control_region) * NUM_DCR + + sizeof(struct acpi_nfit_data_region) * NUM_BDW; + int i; + + t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); + if (!t->nfit_buf) + return -ENOMEM; + t->nfit_size = nfit_size; + + t->spa_set[0] = test_alloc_coherent(t, SPA0_SIZE, &t->spa_set_dma[0]); + if (!t->spa_set[0]) + return -ENOMEM; + + t->spa_set[1] = test_alloc_coherent(t, SPA1_SIZE, &t->spa_set_dma[1]); + if (!t->spa_set[1]) + return -ENOMEM; + + for (i = 0; i < NUM_DCR; i++) { + t->dimm[i] = test_alloc(t, DIMM_SIZE, &t->dimm_dma[i]); + if (!t->dimm[i]) + return -ENOMEM; + + t->label[i] = test_alloc(t, LABEL_SIZE, &t->label_dma[i]); + if (!t->label[i]) + return -ENOMEM; + sprintf(t->label[i], "label%d", i); + } + + for (i = 0; i < NUM_DCR; i++) { + t->dcr[i] = test_alloc(t, LABEL_SIZE, &t->dcr_dma[i]); + if (!t->dcr[i]) + return -ENOMEM; + } + + return 0; +} + +static int nfit_test1_alloc(struct nfit_test *t) +{ + size_t nfit_size = sizeof(struct acpi_table_nfit) + + sizeof(struct acpi_nfit_system_address) + + sizeof(struct acpi_nfit_memory_map) + + sizeof(struct acpi_nfit_control_region); + + t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma); + if (!t->nfit_buf) + return -ENOMEM; + t->nfit_size = nfit_size; + + t->spa_set[0] = test_alloc_coherent(t, SPA2_SIZE, &t->spa_set_dma[0]); + if (!t->spa_set[0]) + return -ENOMEM; + + return 0; +} + +static void nfit_test_init_header(struct acpi_table_nfit *nfit, size_t size) +{ + memcpy(nfit->header.signature, ACPI_SIG_NFIT, 4); + nfit->header.length = size; + nfit->header.revision = 1; + memcpy(nfit->header.oem_id, "LIBND", 6); + memcpy(nfit->header.oem_table_id, "TEST", 5); + nfit->header.oem_revision = 1; + memcpy(nfit->header.asl_compiler_id, "TST", 4); + nfit->header.asl_compiler_revision = 1; +} + +static void nfit_test0_setup(struct nfit_test *t) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct acpi_nfit_memory_map *memdev; + void *nfit_buf = t->nfit_buf; + size_t size = t->nfit_size; + struct acpi_nfit_system_address *spa; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_data_region *bdw; + unsigned int offset; + + nfit_test_init_header(nfit_buf, size); + + /* + * spa0 (interleave first half of dimm0 and dimm1, note storage + * does not actually alias the related block-data-window + * regions) + */ + spa = nfit_buf + sizeof(struct acpi_table_nfit); + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 0+1; + spa->address = t->spa_set_dma[0]; + spa->length = SPA0_SIZE; + + /* + * spa1 (interleave last half of the 4 DIMMS, note storage + * does not actually alias the related block-data-window + * regions) + */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa); + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 1+1; + spa->address = t->spa_set_dma[1]; + spa->length = SPA1_SIZE; + + /* spa2 (dcr0) dimm0 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 2; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 2+1; + spa->address = t->dcr_dma[0]; + spa->length = DCR_SIZE; + + /* spa3 (dcr1) dimm1 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 3; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 3+1; + spa->address = t->dcr_dma[1]; + spa->length = DCR_SIZE; + + /* spa4 (dcr2) dimm2 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 4; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 4+1; + spa->address = t->dcr_dma[2]; + spa->length = DCR_SIZE; + + /* spa5 (dcr3) dimm3 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 5; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_DCR), 16); + spa->range_index = 5+1; + spa->address = t->dcr_dma[3]; + spa->length = DCR_SIZE; + + /* spa6 (bdw for dcr0) dimm0 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 6; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 6+1; + spa->address = t->dimm_dma[0]; + spa->length = DIMM_SIZE; + + /* spa7 (bdw for dcr1) dimm1 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 7; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 7+1; + spa->address = t->dimm_dma[1]; + spa->length = DIMM_SIZE; + + /* spa8 (bdw for dcr2) dimm2 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 8; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 8+1; + spa->address = t->dimm_dma[2]; + spa->length = DIMM_SIZE; + + /* spa9 (bdw for dcr3) dimm3 */ + spa = nfit_buf + sizeof(struct acpi_table_nfit) + sizeof(*spa) * 9; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_BDW), 16); + spa->range_index = 9+1; + spa->address = t->dimm_dma[3]; + spa->length = DIMM_SIZE; + + offset = sizeof(struct acpi_table_nfit) + sizeof(*spa) * 10; + /* mem-region0 (spa0, dimm0) */ + memdev = nfit_buf + offset; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 0+1; + memdev->region_index = 0+1; + memdev->region_size = SPA0_SIZE/2; + memdev->region_offset = t->spa_set_dma[0]; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 2; + + /* mem-region1 (spa0, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map); + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 0+1; + memdev->region_index = 1+1; + memdev->region_size = SPA0_SIZE/2; + memdev->region_offset = t->spa_set_dma[0] + SPA0_SIZE/2; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 2; + + /* mem-region2 (spa1, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 2; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 1; + memdev->range_index = 1+1; + memdev->region_index = 0+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1]; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region3 (spa1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 3; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 1; + memdev->range_index = 1+1; + memdev->region_index = 1+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region4 (spa1, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 4; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 1+1; + memdev->region_index = 2+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + 2*SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region5 (spa1, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 5; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 1+1; + memdev->region_index = 3+1; + memdev->region_size = SPA1_SIZE/4; + memdev->region_offset = t->spa_set_dma[1] + 3*SPA1_SIZE/4; + memdev->address = SPA0_SIZE/2; + memdev->interleave_index = 0; + memdev->interleave_ways = 4; + + /* mem-region6 (spa/dcr0, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 6; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 2+1; + memdev->region_index = 0+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region7 (spa/dcr1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 7; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 3+1; + memdev->region_index = 1+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region8 (spa/dcr2, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 8; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 4+1; + memdev->region_index = 2+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region9 (spa/dcr3, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 9; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 5+1; + memdev->region_index = 3+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region10 (spa/bdw0, dimm0) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 10; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[0]; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 6+1; + memdev->region_index = 0+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region11 (spa/bdw1, dimm1) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 11; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[1]; + memdev->physical_id = 1; + memdev->region_id = 0; + memdev->range_index = 7+1; + memdev->region_index = 1+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region12 (spa/bdw2, dimm2) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 12; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[2]; + memdev->physical_id = 2; + memdev->region_id = 0; + memdev->range_index = 8+1; + memdev->region_index = 2+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + /* mem-region13 (spa/dcr3, dimm3) */ + memdev = nfit_buf + offset + sizeof(struct acpi_nfit_memory_map) * 13; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = handle[3]; + memdev->physical_id = 3; + memdev->region_id = 0; + memdev->range_index = 9+1; + memdev->region_index = 3+1; + memdev->region_size = 0; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + offset = offset + sizeof(struct acpi_nfit_memory_map) * 14; + /* dcr-descriptor0 */ + dcr = nfit_buf + offset; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 0+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[0]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor1 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region); + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 1+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[1]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor2 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 2; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 2+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[2]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + /* dcr-descriptor3 */ + dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 3; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 3+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~handle[3]; + dcr->windows = 1; + dcr->window_size = DCR_SIZE; + dcr->command_offset = 0; + dcr->command_size = 8; + dcr->status_offset = 8; + dcr->status_size = 4; + + offset = offset + sizeof(struct acpi_nfit_control_region) * 4; + /* bdw0 (spa/dcr0, dimm0) */ + bdw = nfit_buf + offset; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 0+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + /* bdw1 (spa/dcr1, dimm1) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region); + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 1+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + /* bdw2 (spa/dcr2, dimm2) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 2; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 2+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + /* bdw3 (spa/dcr3, dimm3) */ + bdw = nfit_buf + offset + sizeof(struct acpi_nfit_data_region) * 3; + bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION; + bdw->header.length = sizeof(struct acpi_nfit_data_region); + bdw->region_index = 3+1; + bdw->windows = 1; + bdw->offset = 0; + bdw->size = BDW_SIZE; + bdw->capacity = DIMM_SIZE; + bdw->start_address = 0; + + acpi_desc = &t->acpi_desc; + set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en); + nd_desc = &acpi_desc->nd_desc; + nd_desc->ndctl = nfit_test_ctl; +} + +static void nfit_test1_setup(struct nfit_test *t) +{ + size_t size = t->nfit_size, offset; + void *nfit_buf = t->nfit_buf; + struct acpi_nfit_memory_map *memdev; + struct acpi_nfit_control_region *dcr; + struct acpi_nfit_system_address *spa; + + nfit_test_init_header(nfit_buf, size); + + offset = sizeof(struct acpi_table_nfit); + /* spa0 (flat range with no bdw aliasing) */ + spa = nfit_buf + offset; + spa->header.type = ACPI_NFIT_TYPE_SYSTEM_ADDRESS; + spa->header.length = sizeof(*spa); + memcpy(spa->range_guid, to_nfit_uuid(NFIT_SPA_PM), 16); + spa->range_index = 0+1; + spa->address = t->spa_set_dma[0]; + spa->length = SPA2_SIZE; + + offset += sizeof(*spa); + /* mem-region0 (spa0, dimm0) */ + memdev = nfit_buf + offset; + memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP; + memdev->header.length = sizeof(*memdev); + memdev->device_handle = 0; + memdev->physical_id = 0; + memdev->region_id = 0; + memdev->range_index = 0+1; + memdev->region_index = 0+1; + memdev->region_size = SPA2_SIZE; + memdev->region_offset = 0; + memdev->address = 0; + memdev->interleave_index = 0; + memdev->interleave_ways = 1; + + offset += sizeof(*memdev); + /* dcr-descriptor0 */ + dcr = nfit_buf + offset; + dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION; + dcr->header.length = sizeof(struct acpi_nfit_control_region); + dcr->region_index = 0+1; + dcr->vendor_id = 0xabcd; + dcr->device_id = 0; + dcr->revision_id = 1; + dcr->serial_number = ~0; + dcr->code = 0x201; + dcr->windows = 0; + dcr->window_size = 0; + dcr->command_offset = 0; + dcr->command_size = 0; + dcr->status_offset = 0; + dcr->status_size = 0; +} + +static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, + void *iobuf, u64 len, int rw) +{ + struct nfit_blk *nfit_blk = ndbr->blk_provider_data; + struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW]; + struct nd_region *nd_region = &ndbr->nd_region; + unsigned int lane; + + lane = nd_region_acquire_lane(nd_region); + if (rw) + memcpy(mmio->base + dpa, iobuf, len); + else + memcpy(iobuf, mmio->base + dpa, len); + nd_region_release_lane(nd_region, lane); + + return 0; +} + +static int nfit_test_probe(struct platform_device *pdev) +{ + struct nvdimm_bus_descriptor *nd_desc; + struct acpi_nfit_desc *acpi_desc; + struct device *dev = &pdev->dev; + struct nfit_test *nfit_test; + int rc; + + nfit_test = to_nfit_test(&pdev->dev); + + /* common alloc */ + if (nfit_test->num_dcr) { + int num = nfit_test->num_dcr; + + nfit_test->dimm = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->dimm_dma = devm_kcalloc(dev, num, sizeof(dma_addr_t), + GFP_KERNEL); + nfit_test->label = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->label_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + nfit_test->dcr = devm_kcalloc(dev, num, + sizeof(struct nfit_test_dcr *), GFP_KERNEL); + nfit_test->dcr_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + if (nfit_test->dimm && nfit_test->dimm_dma && nfit_test->label + && nfit_test->label_dma && nfit_test->dcr + && nfit_test->dcr_dma) + /* pass */; + else + return -ENOMEM; + } + + if (nfit_test->num_pm) { + int num = nfit_test->num_pm; + + nfit_test->spa_set = devm_kcalloc(dev, num, sizeof(void *), + GFP_KERNEL); + nfit_test->spa_set_dma = devm_kcalloc(dev, num, + sizeof(dma_addr_t), GFP_KERNEL); + if (nfit_test->spa_set && nfit_test->spa_set_dma) + /* pass */; + else + return -ENOMEM; + } + + /* per-nfit specific alloc */ + if (nfit_test->alloc(nfit_test)) + return -ENOMEM; + + nfit_test->setup(nfit_test); + acpi_desc = &nfit_test->acpi_desc; + acpi_desc->dev = &pdev->dev; + acpi_desc->nfit = nfit_test->nfit_buf; + acpi_desc->blk_do_io = nfit_test_blk_do_io; + nd_desc = &acpi_desc->nd_desc; + nd_desc->attr_groups = acpi_nfit_attribute_groups; + acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc); + if (!acpi_desc->nvdimm_bus) + return -ENXIO; + + rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size); + if (rc) { + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + return rc; + } + + return 0; +} + +static int nfit_test_remove(struct platform_device *pdev) +{ + struct nfit_test *nfit_test = to_nfit_test(&pdev->dev); + struct acpi_nfit_desc *acpi_desc = &nfit_test->acpi_desc; + + nvdimm_bus_unregister(acpi_desc->nvdimm_bus); + + return 0; +} + +static void nfit_test_release(struct device *dev) +{ + struct nfit_test *nfit_test = to_nfit_test(dev); + + kfree(nfit_test); +} + +static const struct platform_device_id nfit_test_id[] = { + { KBUILD_MODNAME }, + { }, +}; + +static struct platform_driver nfit_test_driver = { + .probe = nfit_test_probe, + .remove = nfit_test_remove, + .driver = { + .name = KBUILD_MODNAME, + }, + .id_table = nfit_test_id, +}; + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +static __init int nfit_test_init(void) +{ + int rc, i; + + nfit_test_setup(nfit_test_lookup); + + for (i = 0; i < NUM_NFITS; i++) { + struct nfit_test *nfit_test; + struct platform_device *pdev; + static int once; + + nfit_test = kzalloc(sizeof(*nfit_test), GFP_KERNEL); + if (!nfit_test) { + rc = -ENOMEM; + goto err_register; + } + INIT_LIST_HEAD(&nfit_test->resources); + switch (i) { + case 0: + nfit_test->num_pm = NUM_PM; + nfit_test->num_dcr = NUM_DCR; + nfit_test->alloc = nfit_test0_alloc; + nfit_test->setup = nfit_test0_setup; + break; + case 1: + nfit_test->num_pm = 1; + nfit_test->alloc = nfit_test1_alloc; + nfit_test->setup = nfit_test1_setup; + break; + default: + rc = -EINVAL; + goto err_register; + } + pdev = &nfit_test->pdev; + pdev->name = KBUILD_MODNAME; + pdev->id = i; + pdev->dev.release = nfit_test_release; + rc = platform_device_register(pdev); + if (rc) { + put_device(&pdev->dev); + goto err_register; + } + + rc = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) + goto err_register; + + instances[i] = nfit_test; + + if (!once++) { + dma_addr_t dma; + void *buf; + + buf = dma_alloc_coherent(&pdev->dev, SZ_128M, &dma, + GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + dev_warn(&pdev->dev, "need 128M of free cma\n"); + goto err_register; + } + dma_free_coherent(&pdev->dev, SZ_128M, buf, dma); + } + } + + rc = platform_driver_register(&nfit_test_driver); + if (rc) + goto err_register; + return 0; + + err_register: + for (i = 0; i < NUM_NFITS; i++) + if (instances[i]) + platform_device_unregister(&instances[i]->pdev); + nfit_test_teardown(); + return rc; +} + +static __exit void nfit_test_exit(void) +{ + int i; + + platform_driver_unregister(&nfit_test_driver); + for (i = 0; i < NUM_NFITS; i++) + platform_device_unregister(&instances[i]->pdev); + nfit_test_teardown(); +} + +module_init(nfit_test_init); +module_exit(nfit_test_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h new file mode 100644 index 000000000000..96c5e16d7db9 --- /dev/null +++ b/tools/testing/nvdimm/test/nfit_test.h @@ -0,0 +1,29 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __NFIT_TEST_H__ +#define __NFIT_TEST_H__ + +struct nfit_test_resource { + struct list_head list; + struct resource *res; + struct device *dev; + void *buf; +}; + +typedef struct nfit_test_resource *(*nfit_test_lookup_fn)(resource_size_t); +void __iomem *__wrap_ioremap_nocache(resource_size_t offset, + unsigned long size); +void __wrap_iounmap(volatile void __iomem *addr); +void nfit_test_setup(nfit_test_lookup_fn lookup); +void nfit_test_teardown(void); +#endif -- cgit v1.2.3 From 581388209405902b56d055f644b4dd124a206112 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 23 Jun 2015 20:08:34 -0400 Subject: libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only Upon detection of an unarmed dimm in a region, arrange for descendant BTT, PMEM, or BLK instances to be read-only. A dimm is primarily marked "unarmed" via flags passed by platform firmware (NFIT). The flags in the NFIT memory device sub-structure indicate the state of the data on the nvdimm relative to its energy source or last "flush to persistence". For the most part there is nothing the driver can do but advertise the state of these flags in sysfs and emit a message if firmware indicates that the contents of the device may be corrupted. However, for the case of ACPI_NFIT_MEM_ARMED, the driver can arrange for the block devices incorporating that nvdimm to be marked read-only. This is a safe default as the data is still available and new writes are held off until the administrator either forces read-write mode, or the energy source becomes armed. A 'read_only' attribute is added to REGION devices to allow for overriding the default read-only policy of all descendant block devices. Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 31 +++++++++++++++++++++++++++++++ drivers/acpi/nfit.h | 3 +++ drivers/nvdimm/blk.c | 2 ++ drivers/nvdimm/btt.c | 10 ++++++++-- drivers/nvdimm/bus.c | 18 ++++++++++++++++++ drivers/nvdimm/nd.h | 3 ++- drivers/nvdimm/pmem.c | 2 ++ drivers/nvdimm/region_devs.c | 29 +++++++++++++++++++++++++++++ include/linux/libnvdimm.h | 2 ++ tools/testing/nvdimm/test/nfit.c | 3 +++ 10 files changed, 100 insertions(+), 3 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 07d630e9f4ae..1f6f1b1a54f4 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -668,6 +668,20 @@ static ssize_t serial_show(struct device *dev, } static DEVICE_ATTR_RO(serial); +static ssize_t flags_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u16 flags = to_nfit_memdev(dev)->flags; + + return sprintf(buf, "%s%s%s%s%s\n", + flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "", + flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "", + flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "", + flags & ACPI_NFIT_MEM_ARMED ? "arm " : "", + flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart " : ""); +} +static DEVICE_ATTR_RO(flags); + static struct attribute *acpi_nfit_dimm_attributes[] = { &dev_attr_handle.attr, &dev_attr_phys_id.attr, @@ -676,6 +690,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = { &dev_attr_format.attr, &dev_attr_serial.attr, &dev_attr_rev_id.attr, + &dev_attr_flags.attr, NULL, }; @@ -768,6 +783,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) struct nvdimm *nvdimm; unsigned long flags = 0; u32 device_handle; + u16 mem_flags; int rc; device_handle = __to_nfit_memdev(nfit_mem)->device_handle; @@ -785,6 +801,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) if (nfit_mem->bdw && nfit_mem->memdev_pmem) flags |= NDD_ALIASING; + mem_flags = __to_nfit_memdev(nfit_mem)->flags; + if (mem_flags & ACPI_NFIT_MEM_ARMED) + flags |= NDD_UNARMED; + rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); if (rc) continue; @@ -797,6 +817,17 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) nfit_mem->nvdimm = nvdimm; dimm_count++; + + if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0) + continue; + + dev_info(acpi_desc->dev, "%s: failed: %s%s%s%s\n", + nvdimm_name(nvdimm), + mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "", + mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "", + mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "", + mem_flags & ACPI_NFIT_MEM_ARMED ? "arm " : ""); + } return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count); diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index c62fffea8423..81f2e8c5a79c 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -22,6 +22,9 @@ #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba" #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" +#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ + | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ + | ACPI_NFIT_MEM_ARMED) enum nfit_uuids { NFIT_SPA_VOLATILE, diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 96ef38ceeceb..4f97b248c236 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -232,6 +232,7 @@ static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, static const struct block_device_operations nd_blk_fops = { .owner = THIS_MODULE, + .revalidate_disk = nvdimm_revalidate_disk, }; static int nd_blk_attach_disk(struct nd_namespace_common *ndns, @@ -283,6 +284,7 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns, } set_capacity(disk, available_disk_size >> SECTOR_SHIFT); + revalidate_disk(disk); return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index c02065aed03d..411c7b2bb37a 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1245,6 +1245,7 @@ static const struct block_device_operations btt_fops = { .owner = THIS_MODULE, .rw_page = btt_rw_page, .getgeo = btt_getgeo, + .revalidate_disk = nvdimm_revalidate_disk, }; static int btt_blk_init(struct btt *btt) @@ -1292,6 +1293,7 @@ static int btt_blk_init(struct btt *btt) } } set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); + revalidate_disk(btt->btt_disk); return 0; } @@ -1346,7 +1348,11 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, goto out_free; } - if (btt->init_state != INIT_READY) { + if (btt->init_state != INIT_READY && nd_region->ro) { + dev_info(dev, "%s is read-only, unable to init btt metadata\n", + dev_name(&nd_region->dev)); + goto out_free; + } else if (btt->init_state != INIT_READY) { btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); dev_dbg(dev, "init: %d arenas for %llu rawsize\n", @@ -1361,7 +1367,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, ret = btt_meta_init(btt); if (ret) { dev_err(dev, "init: error in meta_init: %d\n", ret); - return NULL; + goto out_free; } } diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index dd12f38397db..ec59f1f26d95 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -227,6 +227,24 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, } EXPORT_SYMBOL(__nd_driver_register); +int nvdimm_revalidate_disk(struct gendisk *disk) +{ + struct device *dev = disk->driverfs_dev; + struct nd_region *nd_region = to_nd_region(dev->parent); + const char *pol = nd_region->ro ? "only" : "write"; + + if (nd_region->ro == get_disk_ro(disk)) + return 0; + + dev_info(dev, "%s read-%s, marking %s read-%s\n", + dev_name(&nd_region->dev), pol, disk->disk_name, pol); + set_disk_ro(disk, nd_region->ro); + + return 0; + +} +EXPORT_SYMBOL(nvdimm_revalidate_disk); + static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 4614b00542d1..48b09a210689 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -97,7 +97,7 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id, num_lanes; + int id, num_lanes, ro; void *provider_data; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; @@ -189,6 +189,7 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region); void nvdimm_bus_lock(struct device *dev); void nvdimm_bus_unlock(struct device *dev); bool is_nvdimm_bus_locked(struct device *dev); +int nvdimm_revalidate_disk(struct gendisk *disk); void nvdimm_drvdata_release(struct kref *kref); void put_ndd(struct nvdimm_drvdata *ndd); int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index a9709db0704c..42b766f33e59 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -104,6 +104,7 @@ static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, .direct_access = pmem_direct_access, + .revalidate_disk = nvdimm_revalidate_disk, }; static struct pmem_device *pmem_alloc(struct device *dev, @@ -178,6 +179,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns, pmem->pmem_disk = disk; add_disk(disk); + revalidate_disk(disk); return 0; } diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 2cfb3f74bcbf..482ee3e4e04a 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -345,11 +345,35 @@ static ssize_t btt_seed_show(struct device *dev, } static DEVICE_ATTR_RO(btt_seed); +static ssize_t read_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ro); +} + +static ssize_t read_only_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool ro; + int rc = strtobool(buf, &ro); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + + nd_region->ro = ro; + return len; +} +static DEVICE_ATTR_RW(read_only); + static struct attribute *nd_region_attributes[] = { &dev_attr_size.attr, &dev_attr_nstype.attr, &dev_attr_mappings.attr, &dev_attr_btt_seed.attr, + &dev_attr_read_only.attr, &dev_attr_set_cookie.attr, &dev_attr_available_size.attr, &dev_attr_namespace_seed.attr, @@ -641,6 +665,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, struct device *dev; void *region_buf; unsigned int i; + int ro = 0; for (i = 0; i < ndr_desc->num_mappings; i++) { struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; @@ -652,6 +677,9 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return NULL; } + + if (nvdimm->flags & NDD_UNARMED) + ro = 1; } if (dev_type == &nd_blk_device_type) { @@ -707,6 +735,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->provider_data = ndr_desc->provider_data; nd_region->nd_set = ndr_desc->nd_set; nd_region->num_lanes = ndr_desc->num_lanes; + nd_region->ro = ro; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); dev = &nd_region->dev; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 7fc1b25bdb5d..dc799a29ed1a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -21,6 +21,8 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ NDD_ALIASING = 1 << 0, + /* unarmed memory devices may not persist writes */ + NDD_UNARMED = 1 << 1, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 7a4a5a5edbe4..4b69b8368de0 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -874,6 +874,9 @@ static void nfit_test1_setup(struct nfit_test *t) memdev->address = 0; memdev->interleave_index = 0; memdev->interleave_ways = 1; + memdev->flags = ACPI_NFIT_MEM_SAVE_FAILED | ACPI_NFIT_MEM_RESTORE_FAILED + | ACPI_NFIT_MEM_FLUSH_FAILED | ACPI_NFIT_MEM_HEALTH_OBSERVED + | ACPI_NFIT_MEM_ARMED; offset += sizeof(*memdev); /* dcr-descriptor0 */ -- cgit v1.2.3 From 99759869faf15471cfce251bc138848d8af7d162 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 17:14:15 -0600 Subject: acpi: Add acpi_map_pxm_to_online_node() The kernel initializes CPU & memory's NUMA topology from ACPI SRAT table. Some other ACPI tables, such as NFIT and DMAR, also contain proximity IDs for their device's NUMA topology. This information can be used to improve performance of these devices. This patch introduces acpi_map_pxm_to_online_node(), which is similar to acpi_map_pxm_to_node(), but always returns an online node. When the mapped node from a given proximity ID is offline, it looks up the node distance table and returns the nearest online node. ACPI device drivers, which are called after the NUMA initialization has completed in the kernel, can call this interface to obtain their device NUMA topology from ACPI tables. Such drivers do not have to deal with offline nodes. A node may be offline when a device proximity ID is unique, SRAT memory entry does not exist, or NUMA is disabled, ex. "numa=off" on x86. This patch also moves the pxm range check from acpi_get_node() to acpi_map_pxm_to_node(). Signed-off-by: Toshi Kani Acked-by: Rafael J. Wysocki > Signed-off-by: Dan Williams --- drivers/acpi/numa.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/acpi.h | 5 +++++ 2 files changed, 52 insertions(+), 3 deletions(-) (limited to 'drivers/acpi') diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 1333cbdc3ea2..acaa3b4ea504 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #define PREFIX "ACPI: " @@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node) int acpi_map_pxm_to_node(int pxm) { - int node = pxm_to_node_map[pxm]; + int node; + + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS) + return NUMA_NO_NODE; + + node = pxm_to_node_map[pxm]; if (node == NUMA_NO_NODE) { if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) @@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm) return node; } +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. + */ +int acpi_map_pxm_to_online_node(int pxm) +{ + int node, n, dist, min_dist; + + node = acpi_map_pxm_to_node(pxm); + + if (node == NUMA_NO_NODE) + node = 0; + + if (!node_online(node)) { + min_dist = INT_MAX; + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + node = n; + } + } + } + + return node; +} +EXPORT_SYMBOL(acpi_map_pxm_to_online_node); + static void __init acpi_table_print_srat_entry(struct acpi_subtable_header *header) { @@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle) int pxm; pxm = acpi_get_pxm(handle); - if (pxm < 0 || pxm >= MAX_PXM_DOMAINS) - return NUMA_NO_NODE; return acpi_map_pxm_to_node(pxm); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e4da5e35e29c..1b3bbb11d11c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d); extern void acpi_osi_setup(char *str); #ifdef CONFIG_ACPI_NUMA +int acpi_map_pxm_to_online_node(int pxm); int acpi_get_node(acpi_handle handle); #else +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + return 0; +} static inline int acpi_get_node(acpi_handle handle) { return 0; -- cgit v1.2.3 From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:33 -0600 Subject: libnvdimm: Set numa_node to NVDIMM devices ACPI NFIT table has System Physical Address Range Structure entries that describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is set in the flags. Change acpi_nfit_register_region() to map a proximity ID to its node ID, and set it to a new numa_node field of nd_region_desc, which is then conveyed to the nd_region device. The device core arranges for btt and namespace devices to inherit their node from their parent region. Signed-off-by: Toshi Kani [djbw: move set_dev_node() from region.c to bus.c] Signed-off-by: Dan Williams --- arch/x86/kernel/pmem.c | 1 + drivers/acpi/nfit.c | 6 ++++++ drivers/nvdimm/bus.c | 6 ++++++ drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/region_devs.c | 1 + include/linux/libnvdimm.h | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers/acpi') diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 0f4ef472ab9e..64f90f53bb85 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -67,6 +67,7 @@ static __init int register_e820_pmem(void) memset(&ndr_desc, 0, sizeof(ndr_desc)); ndr_desc.res = &res; ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = NUMA_NO_NODE; if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) goto err; } diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 1f6f1b1a54f4..d96c8fe974dd 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->res = &res; ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; + if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) + ndr_desc->numa_node = acpi_map_pxm_to_online_node( + spa->proximity_domain); + else + ndr_desc->numa_node = NUMA_NO_NODE; + list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping *nd_mapping; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index ec59f1f26d95..205344643852 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -48,6 +48,12 @@ static int to_nd_device_type(struct device *dev) static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { + /* + * Ensure that region devices always have their numa node set as + * early as possible. + */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, to_nd_device_type(dev)); } diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 48b09a210689..c41f53e74277 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -97,7 +97,7 @@ struct nd_region { u16 ndr_mappings; u64 ndr_size; u64 ndr_start; - int id, num_lanes, ro; + int id, num_lanes, ro, numa_node; void *provider_data; struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 482ee3e4e04a..a5233422f9dc 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -736,6 +736,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->nd_set = ndr_desc->nd_set; nd_region->num_lanes = ndr_desc->num_lanes; nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; ida_init(&nd_region->ns_ida); ida_init(&nd_region->btt_ida); dev = &nd_region->dev; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index dc799a29ed1a..30b3deaafd51 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -89,6 +89,7 @@ struct nd_region_desc { struct nd_interleave_set *nd_set; void *provider_data; int num_lanes; + int numa_node; }; struct nvdimm_bus; -- cgit v1.2.3 From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 19 Jun 2015 12:18:34 -0600 Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices Add support of sysfs 'numa_node' to I/O-related NVDIMM devices under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x. An example of numa_node values on a 2-socket system with a single NVDIMM range on each socket is shown below. /sys/bus/nd/devices |-- btt0.0/numa_node:0 |-- btt1.0/numa_node:1 |-- btt1.1/numa_node:1 |-- namespace0.0/numa_node:0 |-- namespace1.0/numa_node:1 |-- region0/numa_node:0 |-- region1/numa_node:1 These numa_node files are then linked under the block class of their device names. /sys/class/block/pmem0/device/numa_node:0 /sys/class/block/pmem1s/device/numa_node:1 This enables numactl(8) to accept 'block:' and 'file:' paths of pmem and btt devices as shown in the examples below. numactl --preferred block:pmem0 --show numactl --preferred file:/dev/pmem1s --show Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/acpi/nfit.c | 1 + drivers/nvdimm/btt_devs.c | 1 + drivers/nvdimm/bus.c | 30 ++++++++++++++++++++++++++++++ drivers/nvdimm/namespace_devs.c | 1 + include/linux/libnvdimm.h | 1 + 5 files changed, 34 insertions(+) (limited to 'drivers/acpi') diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index d96c8fe974dd..2161fa178c8d 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = { &nd_region_attribute_group, &nd_mapping_attribute_group, &nd_device_attribute_group, + &nd_numa_attribute_group, &acpi_nfit_region_attribute_group, NULL, }; diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 661aacedc140..6ac8c0fea3ec 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -293,6 +293,7 @@ static struct attribute_group nd_btt_attribute_group = { static const struct attribute_group *nd_btt_attribute_groups[] = { &nd_btt_attribute_group, &nd_device_attribute_group, + &nd_numa_attribute_group, NULL, }; diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 205344643852..8eb22c0ca7ce 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -280,6 +280,36 @@ struct attribute_group nd_device_attribute_group = { }; EXPORT_SYMBOL_GPL(nd_device_attribute_group); +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + NULL, +}; + +static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + return a->mode; +} + +/** + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; +EXPORT_SYMBOL_GPL(nd_numa_attribute_group); + int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) { dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 27d69bd3b4d6..fef0dd80d4ad 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1228,6 +1228,7 @@ static struct attribute_group nd_namespace_attribute_group = { static const struct attribute_group *nd_namespace_attribute_groups[] = { &nd_device_attribute_group, &nd_namespace_attribute_group, + &nd_numa_attribute_group, NULL, }; diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 30b3deaafd51..75e3af01ee32 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -38,6 +38,7 @@ enum { extern struct attribute_group nvdimm_bus_attribute_group; extern struct attribute_group nvdimm_attribute_group; extern struct attribute_group nd_device_attribute_group; +extern struct attribute_group nd_numa_attribute_group; extern struct attribute_group nd_region_attribute_group; extern struct attribute_group nd_mapping_attribute_group; -- cgit v1.2.3