From 5d64efe79703c4de1f017af9f7643ead1dc85432 Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Fri, 28 Feb 2020 11:34:51 -0500
Subject: pmem: Add functions for reading/writing page to/from pmem

This splits pmem_do_bvec() into pmem_do_read() and pmem_do_write().
pmem_do_write() will be used by pmem zero_page_range() as well. Hence
sharing the same code.

Suggested-by: Christoph Hellwig
Signed-off-by: Vivek Goyal
Reviewed-by: Christoph Hellwig
Reviewed-by: Pankaj Gupta
Link: https://lore.kernel.org/r/20200228163456.1587-2-vgoyal@redhat.com
Signed-off-by: Dan Williams
---
 drivers/nvdimm/pmem.c | 86 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 36 deletions(-)

(limited to 'drivers/nvdimm/pmem.c')

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4eae441f86c9..075b11682192 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
 	return BLK_STS_OK;
 }
 
-static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-			unsigned int len, unsigned int off, unsigned int op,
-			sector_t sector)
+static blk_status_t pmem_do_read(struct pmem_device *pmem,
+			struct page *page, unsigned int page_off,
+			sector_t sector, unsigned int len)
+{
+	blk_status_t rc;
+	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
+	void *pmem_addr = pmem->virt_addr + pmem_off;
+
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+		return BLK_STS_IOERR;
+
+	rc = read_pmem(page, page_off, pmem_addr, len);
+	flush_dcache_page(page);
+	return rc;
+}
+
+static blk_status_t pmem_do_write(struct pmem_device *pmem,
+			struct page *page, unsigned int page_off,
+			sector_t sector, unsigned int len)
 {
 	blk_status_t rc = BLK_STS_OK;
 	bool bad_pmem = false;
@@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
 		bad_pmem = true;
 
-	if (!op_is_write(op)) {
-		if (unlikely(bad_pmem))
-			rc = BLK_STS_IOERR;
-		else {
-			rc = read_pmem(page, off, pmem_addr, len);
-			flush_dcache_page(page);
-		}
-	} else {
-		/*
-		 * Note that we write the data both before and after
-		 * clearing poison. The write before clear poison
-		 * handles situations where the latest written data is
-		 * preserved and the clear poison operation simply marks
-		 * the address range as valid without changing the data.
-		 * In this case application software can assume that an
-		 * interrupted write will either return the new good
-		 * data or an error.
-		 *
-		 * However, if pmem_clear_poison() leaves the data in an
-		 * indeterminate state we need to perform the write
-		 * after clear poison.
-		 */
-		flush_dcache_page(page);
-		write_pmem(pmem_addr, page, off, len);
-		if (unlikely(bad_pmem)) {
-			rc = pmem_clear_poison(pmem, pmem_off, len);
-			write_pmem(pmem_addr, page, off, len);
-		}
+	/*
+	 * Note that we write the data both before and after
+	 * clearing poison. The write before clear poison
+	 * handles situations where the latest written data is
+	 * preserved and the clear poison operation simply marks
+	 * the address range as valid without changing the data.
+	 * In this case application software can assume that an
+	 * interrupted write will either return the new good
+	 * data or an error.
+	 *
+	 * However, if pmem_clear_poison() leaves the data in an
+	 * indeterminate state we need to perform the write
+	 * after clear poison.
+	 */
+	flush_dcache_page(page);
+	write_pmem(pmem_addr, page, page_off, len);
+	if (unlikely(bad_pmem)) {
+		rc = pmem_clear_poison(pmem, pmem_off, len);
+		write_pmem(pmem_addr, page, page_off, len);
 	}
 
 	return rc;
@@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
-		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-				bvec.bv_offset, bio_op(bio), iter.bi_sector);
+		if (op_is_write(bio_op(bio)))
+			rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
+				iter.bi_sector, bvec.bv_len);
+		else
+			rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset,
+				iter.bi_sector, bvec.bv_len);
 		if (rc) {
 			bio->bi_status = rc;
 			break;
@@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	blk_status_t rc;
 
-	rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
-			  0, op, sector);
-
+	if (op_is_write(op))
+		rc = pmem_do_write(pmem, page, 0, sector,
+				   hpage_nr_pages(page) * PAGE_SIZE);
+	else
+		rc = pmem_do_read(pmem, page, 0, sector,
+				  hpage_nr_pages(page) * PAGE_SIZE);
 	/*
 	 * The ->rw_page interface is subtle and tricky. The core
 	 * retries on any error, so we can only invoke page_endio() in
--
cgit v1.2.3
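A minimal sketch (not part of the patch; the helper name is hypothetical) of
what the split buys: with pmem_do_write() factored out, zeroing a page of pmem
reduces to writing the kernel's shared zero page, which is exactly the shape
of caller the next patch in this series adds.

	/* Illustration only: zero one page of pmem via the new helper. */
	static blk_status_t pmem_zero_one_page(struct pmem_device *pmem,
			sector_t sector)
	{
		/* ZERO_PAGE(0) is the kernel's shared all-zeroes page. */
		return pmem_do_write(pmem, ZERO_PAGE(0), 0, sector, PAGE_SIZE);
	}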
From f605a263e0690177ecc180417eacf2b5507dd177 Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Fri, 28 Feb 2020 11:34:52 -0500
Subject: dax, pmem: Add a dax operation zero_page_range

Add a dax operation, zero_page_range, to zero a page. This also clears
any known poison in the page being zeroed.

As of now, only one page may be zeroed in a single call, since there are
no callers that try to zero more than a page at a time. Once callers that
zero more than a page in a single call appear, that support can be added.
The primary reason for not doing it yet is that it would add a little
complexity to the dm implementation, where a range might span multiple
underlying targets; the range would have to be split into sub-ranges and
zero_page_range() called on each target individually.

Suggested-by: Christoph Hellwig
Signed-off-by: Vivek Goyal
Reviewed-by: Pankaj Gupta
Link: https://lore.kernel.org/r/20200228163456.1587-3-vgoyal@redhat.com
Signed-off-by: Dan Williams
---
 drivers/dax/super.c   | 20 ++++++++++++++++++++
 drivers/nvdimm/pmem.c | 11 +++++++++++
 include/linux/dax.h   |  4 ++++
 3 files changed, 35 insertions(+)

(limited to 'drivers/nvdimm/pmem.c')

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0aa4b6bc5101..e498daf3c0d7 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -344,6 +344,26 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 }
 EXPORT_SYMBOL_GPL(dax_copy_to_iter);
 
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+			size_t nr_pages)
+{
+	if (!dax_alive(dax_dev))
+		return -ENXIO;
+
+	if (!dax_dev->ops->zero_page_range)
+		return -EOPNOTSUPP;
+	/*
+	 * There are no callers that want to zero more than one page as of now.
+	 * Once users are there, this check can be removed after the
+	 * device mapper code has been updated to split ranges across targets.
+	 */
+	if (nr_pages != 1)
+		return -EIO;
+
+	return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 075b11682192..5b774ddd0efb 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -282,6 +282,16 @@ static const struct block_device_operations pmem_fops = {
 	.revalidate_disk = nvdimm_revalidate_disk,
 };
 
+static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+				    size_t nr_pages)
+{
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+
+	return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0,
+				   PFN_PHYS(pgoff) >> SECTOR_SHIFT,
+				   PAGE_SIZE));
+}
+
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
 		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -313,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = {
 	.dax_supported = generic_fsdax_supported,
 	.copy_from_iter = pmem_copy_from_iter,
 	.copy_to_iter = pmem_copy_to_iter,
+	.zero_page_range = pmem_dax_zero_page_range,
 };
 
 static const struct attribute_group *pmem_attribute_groups[] = {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 328c2dbb4409..71735c430c05 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -34,6 +34,8 @@ struct dax_operations {
 	/* copy_to_iter: required operation for fs-dax direct-i/o */
 	size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
 			struct iov_iter *);
+	/* zero_page_range: required operation. Zero page range */
+	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
 
 extern struct attribute_group dax_attribute_group;
@@ -199,6 +201,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i);
+int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
+			size_t nr_pages);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
--
cgit v1.2.3
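A minimal usage sketch (not part of the patch; the function is hypothetical
and assumes a page-aligned, device-relative byte offset) showing how a caller
zeroes one page of a dax device with the new helper:

	/* Illustration only: zero the page at byte 'offset' on the device. */
	static int zero_one_dax_page(struct dax_device *dax_dev, loff_t offset)
	{
		pgoff_t pgoff = offset >> PAGE_SHIFT;

		/* Only nr_pages == 1 is supported for now; more returns -EIO. */
		return dax_zero_page_range(dax_dev, pgoff, 1);
	}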
From 4e4ced93794acb42adb19484132966defba8f3a6 Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Wed, 1 Apr 2020 12:11:25 -0400
Subject: dax: Move mandatory ->zero_page_range() check in alloc_dax()

The zero_page_range() dax operation is mandatory for dax devices. Right
now that check happens in dax_zero_page_range(). Dan thinks that's too
late, and that it's better to do the check earlier, in alloc_dax().

I also modified alloc_dax() to return an ERR_PTR() encoded error code on
failure. Right now it returns NULL, and the caller assumes the failure
was due to -ENOMEM. But with the ->zero_page_range() check, -EINVAL needs
to be returned instead.
Signed-off-by: Vivek Goyal
Link: https://lore.kernel.org/r/20200401161125.GB9398@redhat.com
Signed-off-by: Dan Williams
---
 drivers/dax/bus.c            |  4 +++-
 drivers/dax/super.c          | 14 +++++++++-----
 drivers/md/dm.c              |  2 +-
 drivers/nvdimm/pmem.c        |  4 ++--
 drivers/s390/block/dcssblk.c |  5 +++--
 5 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'drivers/nvdimm/pmem.c')

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 46e46047a1f7..df238c8b6ef2 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -421,8 +421,10 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id,
 	 * device outside of mmap of the resulting character device.
 	 */
 	dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
-	if (!dax_dev)
+	if (IS_ERR(dax_dev)) {
+		rc = PTR_ERR(dax_dev);
 		goto err;
+	}
 
 	/* a device_dax instance is dead while the driver is not attached */
 	kill_dax(dax_dev);
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e498daf3c0d7..8e32345be0f7 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -349,9 +349,6 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 {
 	if (!dax_alive(dax_dev))
 		return -ENXIO;
-
-	if (!dax_dev->ops->zero_page_range)
-		return -EOPNOTSUPP;
 	/*
 	 * There are no callers that want to zero more than one page as of now.
 	 * Once users are there, this check can be removed after the
@@ -571,9 +568,16 @@ struct dax_device *alloc_dax(void *private, const char *__host,
 	dev_t devt;
 	int minor;
 
+	if (ops && !ops->zero_page_range) {
+		pr_debug("%s: error: device does not provide dax"
+			 " operation zero_page_range()\n",
+			 __host ? __host : "Unknown");
+		return ERR_PTR(-EINVAL);
+	}
+
 	host = kstrdup(__host, GFP_KERNEL);
 	if (__host && !host)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
 	if (minor < 0)
@@ -596,7 +600,7 @@ struct dax_device *alloc_dax(void *private, const char *__host,
 	ida_simple_remove(&dax_minor_ida, minor);
  err_minor:
 	kfree(host);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL_GPL(alloc_dax);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index aa72d9e757c1..6504f0a358e5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2005,7 +2005,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
 		md->dax_dev = alloc_dax(md, md->disk->disk_name,
 					&dm_dax_ops, 0);
-		if (!md->dax_dev)
+		if (IS_ERR(md->dax_dev))
 			goto bad;
 	}
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b774ddd0efb..715cb0696525 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -487,9 +487,9 @@ static int pmem_attach_disk(struct device *dev,
 	if (is_nvdimm_sync(nd_region))
 		flags = DAXDEV_F_SYNC;
 	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
-	if (!dax_dev) {
+	if (IS_ERR(dax_dev)) {
 		put_disk(disk);
-		return -ENOMEM;
+		return PTR_ERR(dax_dev);
 	}
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index ab3e2898816c..9b05cf1a25c4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -695,8 +695,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
 			&dcssblk_dax_ops, DAXDEV_F_SYNC);
-	if (!dev_info->dax_dev) {
-		rc = -ENOMEM;
+	if (IS_ERR(dev_info->dax_dev)) {
+		rc = PTR_ERR(dev_info->dax_dev);
+		dev_info->dax_dev = NULL;
 		goto put_dev;
 	}
 
--
cgit v1.2.3
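A minimal sketch (not part of the series; the driver names are hypothetical)
of the calling convention this last patch establishes: alloc_dax() now
reports failure via ERR_PTR() instead of NULL, so callers check with IS_ERR()
and propagate PTR_ERR().

	dax_dev = alloc_dax(mydev, disk->disk_name, &mydrv_dax_ops, 0);
	if (IS_ERR(dax_dev)) {
		/* -EINVAL if mydrv_dax_ops lacks ->zero_page_range(), else -ENOMEM. */
		rc = PTR_ERR(dax_dev);
		goto out_put_disk;
	}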