diff options
Diffstat (limited to 'drivers/dax/dax.c')
-rw-r--r-- | drivers/dax/dax.c | 579 |
1 files changed, 351 insertions, 228 deletions
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 803f3953b341..0e499bfca41c 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -13,15 +13,25 @@ #include <linux/pagemap.h> #include <linux/module.h> #include <linux/device.h> +#include <linux/mount.h> #include <linux/pfn_t.h> +#include <linux/hash.h> +#include <linux/cdev.h> #include <linux/slab.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> +#include "dax.h" -static int dax_major; +static dev_t dax_devt; static struct class *dax_class; static DEFINE_IDA(dax_minor_ida); +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +static struct vfsmount *dax_mnt; +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; +MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); /** * struct dax_region - mapping infrastructure for dax devices @@ -48,7 +58,7 @@ struct dax_region { * struct dax_dev - subdivision of a dax region * @region - parent region * @dev - device backing the character device - * @kref - enable this data to be tracked in filp->private_data + * @cdev - core chardev data * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device @@ -56,41 +66,139 @@ struct dax_region { */ struct dax_dev { struct dax_region *region; - struct device *dev; - struct kref kref; + struct inode *inode; + struct device dev; + struct cdev cdev; bool alive; int id; int num_resources; struct resource res[0]; }; -static void dax_region_free(struct kref *kref) +static struct inode *dax_alloc_inode(struct super_block *sb) { - struct dax_region *dax_region; + return kmem_cache_alloc(dax_cache, GFP_KERNEL); +} - dax_region = container_of(kref, struct dax_region, kref); - kfree(dax_region); +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(dax_cache, inode); } -void dax_region_put(struct dax_region *dax_region) +static void dax_destroy_inode(struct inode *inode) { - kref_put(&dax_region->kref, dax_region_free); + call_rcu(&inode->i_rcu, dax_i_callback); } -EXPORT_SYMBOL_GPL(dax_region_put); -static void dax_dev_free(struct kref *kref) +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - struct dax_dev *dax_dev; + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} - dax_dev = container_of(kref, struct dax_dev, kref); - dax_region_put(dax_dev->region); - kfree(dax_dev); +static struct file_system_type dax_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + return inode->i_cdev == data; +} + +static int dax_set(struct inode *inode, void *data) +{ + inode->i_cdev = data; + return 0; +} + +static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) +{ + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, cdev); + + if (!inode) + return NULL; + + if (inode->i_state & I_NEW) { + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + inode->i_rdev = devt; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + return inode; +} + +static void init_once(void *inode) +{ + inode_init_once(inode); +} + +static int dax_inode_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; } -static void dax_dev_put(struct dax_dev *dax_dev) +static void dax_inode_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_type); + kmem_cache_destroy(dax_cache); +} + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) { - kref_put(&dax_dev->kref, dax_dev_free); + kref_put(&dax_region->kref, dax_region_free); } +EXPORT_SYMBOL_GPL(dax_region_put); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, unsigned int align, void *addr, @@ -98,8 +206,11 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, { struct dax_region *dax_region; - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); if (!dax_region) return NULL; @@ -116,10 +227,15 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id, } EXPORT_SYMBOL_GPL(alloc_dax_region); +static struct dax_dev *to_dax_dev(struct device *dev) +{ + return container_of(dev, struct dax_dev, dev); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct dax_dev *dax_dev = dev_get_drvdata(dev); + struct dax_dev *dax_dev = to_dax_dev(dev); unsigned long long size = 0; int i; @@ -144,180 +260,11 @@ static const struct attribute_group *dax_attribute_groups[] = { NULL, }; -static void unregister_dax_dev(void *_dev) -{ - struct device *dev = _dev; - struct dax_dev *dax_dev = dev_get_drvdata(dev); - struct dax_region *dax_region = dax_dev->region; - - dev_dbg(dev, "%s\n", __func__); - - /* - * Note, rcu is not protecting the liveness of dax_dev, rcu is - * ensuring that any fault handlers that might have seen - * dax_dev->alive == true, have completed. Any fault handlers - * that start after synchronize_rcu() has started will abort - * upon seeing dax_dev->alive == false. - */ - dax_dev->alive = false; - synchronize_rcu(); - - get_device(dev); - device_unregister(dev); - ida_simple_remove(&dax_region->ida, dax_dev->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - put_device(dev); - dax_dev_put(dax_dev); -} - -int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, - int count) -{ - struct device *parent = dax_region->dev; - struct dax_dev *dax_dev; - struct device *dev; - int rc, minor; - dev_t dev_t; - - dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); - if (!dax_dev) - return -ENOMEM; - memcpy(dax_dev->res, res, sizeof(*res) * count); - dax_dev->num_resources = count; - kref_init(&dax_dev->kref); - dax_dev->alive = true; - dax_dev->region = dax_region; - kref_get(&dax_region->kref); - - dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dax_dev->id < 0) { - rc = dax_dev->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(dax_major, minor); - dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, - dax_attribute_groups, "dax%d.%d", dax_region->id, - dax_dev->id); - if (IS_ERR(dev)) { - rc = PTR_ERR(dev); - goto err_create; - } - dax_dev->dev = dev; - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); - if (rc) - return rc; - - return 0; - - err_create: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dax_dev->id); - err_id: - dax_dev_put(dax_dev); - - return rc; -} -EXPORT_SYMBOL_GPL(devm_create_dax_dev); - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_dev_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dax_dev *dax_dev = filp ? filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dax_dev || addr) - goto out; - - dax_region = dax_dev->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int __match_devt(struct device *dev, const void *data) -{ - const dev_t *devt = data; - - return dev->devt == *devt; -} - -static struct device *dax_dev_find(dev_t dev_t) -{ - return class_find_device(dax_class, NULL, &dev_t, __match_devt); -} - -static int dax_dev_open(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = NULL; - struct device *dev; - - dev = dax_dev_find(inode->i_rdev); - if (!dev) - return -ENXIO; - - device_lock(dev); - dax_dev = dev_get_drvdata(dev); - if (dax_dev) { - dev_dbg(dev, "%s\n", __func__); - filp->private_data = dax_dev; - kref_get(&dax_dev->kref); - inode->i_flags = S_DAX; - } - device_unlock(dev); - - if (!dax_dev) { - put_device(dev); - return -ENXIO; - } - return 0; -} - -static int dax_dev_release(struct inode *inode, struct file *filp) -{ - struct dax_dev *dax_dev = filp->private_data; - struct device *dev = dax_dev->dev; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); - put_device(dev); - - return 0; -} - static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, const char *func) { struct dax_region *dax_region = dax_dev->region; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; unsigned long mask; if (!dax_dev->alive) @@ -382,7 +329,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long) vmf->virtual_address; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; phys_addr_t phys; @@ -422,7 +369,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); @@ -437,7 +384,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, unsigned int flags) { unsigned long pmd_addr = addr & PMD_MASK; - struct device *dev = dax_dev->dev; + struct device *dev = &dax_dev->dev; struct dax_region *dax_region; phys_addr_t phys; pgoff_t pgoff; @@ -459,7 +406,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, } pgoff = linear_page_index(vma, pmd_addr); - phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); + phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); if (phys == -1) { dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, pgoff); @@ -479,7 +426,7 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; - dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, current->comm, (flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); @@ -490,81 +437,257 @@ static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return rc; } -static void dax_dev_vm_open(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - kref_get(&dax_dev->kref); -} - -static void dax_dev_vm_close(struct vm_area_struct *vma) -{ - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(dax_dev->dev, "%s\n", __func__); - dax_dev_put(dax_dev); -} - static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, .pmd_fault = dax_dev_pmd_fault, - .open = dax_dev_vm_open, - .close = dax_dev_vm_close, }; -static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) { struct dax_dev *dax_dev = filp->private_data; int rc; - dev_dbg(dax_dev->dev, "%s\n", __func__); + dev_dbg(&dax_dev->dev, "%s\n", __func__); rc = check_vma(dax_dev, vma, __func__); if (rc) return rc; - kref_get(&dax_dev->kref); vma->vm_ops = &dax_dev_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; return 0; +} + +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev; + + dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); + dev_dbg(&dax_dev->dev, "%s\n", __func__); + inode->i_mapping = dax_dev->inode->i_mapping; + inode->i_mapping->host = dax_dev->inode; + filp->f_mapping = inode->i_mapping; + filp->private_data = dax_dev; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(&dax_dev->dev, "%s\n", __func__); + return 0; } static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, - .open = dax_dev_open, - .release = dax_dev_release, - .get_unmapped_area = dax_dev_get_unmapped_area, - .mmap = dax_dev_mmap, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, }; +static void dax_dev_release(struct device *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct dax_region *dax_region = dax_dev->region; + + ida_simple_remove(&dax_region->ida, dax_dev->id); + ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); + dax_region_put(dax_region); + iput(dax_dev->inode); + kfree(dax_dev); +} + +static void unregister_dax_dev(void *dev) +{ + struct dax_dev *dax_dev = to_dax_dev(dev); + struct cdev *cdev = &dax_dev->cdev; + + dev_dbg(dev, "%s\n", __func__); + + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. + */ + dax_dev->alive = false; + synchronize_rcu(); + unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); + cdev_del(cdev); + device_unregister(dev); +} + +struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, + struct resource *res, int count) +{ + struct device *parent = dax_region->dev; + struct dax_dev *dax_dev; + int rc = 0, minor, i; + struct device *dev; + struct cdev *cdev; + dev_t dev_t; + + dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); + if (!dax_dev) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dax_dev->res[i].start = res[i].start; + dax_dev->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + + dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dax_dev->id < 0) { + rc = dax_dev->id; + goto err_id; + } + + minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) { + rc = minor; + goto err_minor; + } + + dev_t = MKDEV(MAJOR(dax_devt), minor); + dev = &dax_dev->dev; + dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); + if (!dax_dev->inode) { + rc = -ENOMEM; + goto err_inode; + } + + /* device_initialize() so cdev can reference kobj parent */ + device_initialize(dev); + + cdev = &dax_dev->cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = parent->driver->owner; + cdev->kobj.parent = &dev->kobj; + rc = cdev_add(&dax_dev->cdev, dev_t, 1); + if (rc) + goto err_cdev; + + /* from here on we're committed to teardown via dax_dev_release() */ + dax_dev->num_resources = count; + dax_dev->alive = true; + dax_dev->region = dax_region; + kref_get(&dax_region->kref); + + dev->devt = dev_t; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dax_dev_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); + rc = device_add(dev); + if (rc) { + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); + if (rc) + return ERR_PTR(rc); + + return dax_dev; + + err_cdev: + iput(dax_dev->inode); + err_inode: + ida_simple_remove(&dax_minor_ida, minor); + err_minor: + ida_simple_remove(&dax_region->ida, dax_dev->id); + err_id: + kfree(dax_dev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dax_dev); + static int __init dax_init(void) { int rc; - rc = register_chrdev(0, "dax", &dax_fops); - if (rc < 0) + rc = dax_inode_init(); + if (rc) return rc; - dax_major = rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + goto err_chrdev; dax_class = class_create(THIS_MODULE, "dax"); if (IS_ERR(dax_class)) { - unregister_chrdev(dax_major, "dax"); - return PTR_ERR(dax_class); + rc = PTR_ERR(dax_class); + goto err_class; } return 0; + + err_class: + unregister_chrdev_region(dax_devt, nr_dax); + err_chrdev: + dax_inode_exit(); + return rc; } static void __exit dax_exit(void) { class_destroy(dax_class); - unregister_chrdev(dax_major, "dax"); + unregister_chrdev_region(dax_devt, nr_dax); ida_destroy(&dax_minor_ida); + dax_inode_exit(); } MODULE_AUTHOR("Intel Corporation"); |