diff options
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r-- | fs/block_dev.c | 207 |
1 files changed, 163 insertions, 44 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 198243717da5..ba762ea07f67 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -28,6 +28,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/dax.h> #include <asm/uaccess.h> #include "internal.h" @@ -49,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -static void bdev_write_inode(struct inode *inode) +static void bdev_write_inode(struct block_device *bdev) { + struct inode *inode = bdev->bd_inode; + int ret; + spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); - WARN_ON_ONCE(write_inode_now(inode, true)); + ret = write_inode_now(inode, true); + if (ret) { + char name[BDEVNAME_SIZE]; + pr_warn_ratelimited("VFS: Dirty inode writeback failed " + "for block device %s (err=%d).\n", + bdevname(bdev, name), ret); + } spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); @@ -146,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -328,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -339,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -380,9 +395,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; + int result = -EOPNOTSUPP; + if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + return result; + + result = blk_queue_enter(bdev->bd_queue, false); + if (result) + return result; + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + blk_queue_exit(bdev->bd_queue); + return result; } EXPORT_SYMBOL_GPL(bdev_read_page); @@ -411,14 +434,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; + result = blk_queue_enter(bdev->bd_queue, false); + if (result) + return result; + set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); if (result) end_page_writeback(page); else unlock_page(page); + blk_queue_exit(bdev->bd_queue); return result; } EXPORT_SYMBOL_GPL(bdev_write_page); @@ -426,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); /** * bdev_direct_access() - Get the address for directly-accessibly memory * @bdev: The device containing the memory - * @sector: The offset within the device - * @addr: Where to put the address of the memory - * @pfn: The Page Frame Number for the memory - * @size: The number of bytes requested + * @dax: control and output parameters for ->direct_access * * If a block device is made up of directly addressable memory, this function * will tell the caller the PFN and the address of the memory. The address @@ -440,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * Return: negative errno if an error occurs, otherwise the number of bytes * accessible at this address. */ -long bdev_direct_access(struct block_device *bdev, sector_t sector, - void **addr, unsigned long *pfn, long size) +long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) { - long avail; + sector_t sector = dax->sector; + long avail, size = dax->size; const struct block_device_operations *ops = bdev->bd_disk->fops; /* @@ -462,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn, size); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); if (!avail) return -ERANGE; + if (avail > 0 && avail & ~PAGE_MASK) + return -ENXIO; return min(avail, size); } EXPORT_SYMBOL_GPL(bdev_direct_access); @@ -566,7 +594,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) @@ -672,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -688,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -711,7 +739,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /** @@ -1018,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); static void flush_disk(struct block_device *bdev, bool kill_dirty) { if (__invalidate_device(bdev, kill_dirty)) { - char name[BDEVNAME_SIZE] = ""; - - if (bdev->bd_disk) - disk_name(bdev->bd_disk, 0, name); printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", name); + "resized disk %s\n", + bdev->bd_disk ? bdev->bd_disk->disk_name : ""); } if (!bdev->bd_disk) @@ -1047,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - char name[BDEVNAME_SIZE]; - - disk_name(disk, 0, name); printk(KERN_INFO "%s: detected capacity change from %lld to %lld\n", - name, bdev_size, disk_size); + disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); flush_disk(bdev, false); } @@ -1074,7 +1096,7 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - + blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1206,8 +1228,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1221,6 +1246,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1241,6 +1267,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { @@ -1492,11 +1520,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + + bdev_write_inode(bdev); /* - * ->release can cause the queue to disappear, so flush all - * dirty data before. + * Detaching bdev inode from its wb in __destroy_inode() + * is too late: the queue which embeds its bdi (along with + * root wb) can be gone as soon as we put_disk() below. */ - bdev_write_inode(bdev->bd_inode); + inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1571,14 +1602,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1603,7 +1634,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1635,7 +1666,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1674,13 +1705,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT @@ -1769,7 +1888,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; @@ -1781,13 +1900,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -1795,8 +1914,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } |