diff options
Diffstat (limited to 'fs')
167 files changed, 3818 insertions, 2066 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca025019d..072e7599583a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 4d4a0df8344f..c9fdfb112933 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/super.c b/fs/affs/super.c index 8836df5f1e11..2a6713b6b9f4 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129f7d..81afefe7d8a6 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 25250fa87086..cc0e08252913 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f430..1e5c896f6b79 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/block_dev.c b/fs/block_dev.c index 01b8e0d4b4ff..81c0705558be 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -338,7 +343,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; mutex_lock(&bd_inode->i_mutex); @@ -349,7 +354,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -432,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, GFP_NOIO); if (result) return result; @@ -590,7 +595,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) @@ -1224,8 +1229,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1239,6 +1247,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1259,12 +1268,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1599,14 +1603,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1631,7 +1635,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1663,7 +1667,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1702,13 +1706,101 @@ static const struct address_space_operations def_blk_aops = { .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static void blkdev_vm_open(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + mutex_unlock(&bd_inode->i_mutex); +} + +static void blkdev_vm_close(struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(vma->vm_file); + struct block_device *bdev = I_BDEV(bd_inode); + + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count--; + mutex_unlock(&bd_inode->i_mutex); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_fault, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .open = blkdev_vm_open, + .close = blkdev_vm_close, + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(bd_inode); + + file_accessed(file); + mutex_lock(&bd_inode->i_mutex); + bdev->bd_map_count++; + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + mutex_unlock(&bd_inode->i_mutex); + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b8856e182ae..394017831692 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9161,7 +9161,8 @@ int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada328..ca4d5e8457f1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index b7fcb3151103..c4c1169814b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1092,7 +1092,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b87a3..57e81cbba0fa 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 8d38cd07b207..b4539e84e577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1571,7 +1571,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -3415,7 +3416,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474bee9..e25b6b06bacf 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f7da..cb68dac4f9d3 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8d0c0df01854..ed70cf9fdc7b 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -45,10 +45,10 @@ struct eventfd_ctx { * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returining a POLLERR + * value, and we signal this as overflow condition by returning a POLLERR * to poll(2). * - * Returns the amount by which the counter was incrememnted. This will be less + * Returns the amount by which the counter was incremented. This will be less * than @n if the counter has overflowed. */ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b5e1..6658a50530a0 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc902..2a188413a2b0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -203,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6e5a..f1b56ff01208 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -966,7 +966,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..3842af954cd5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -237,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); return 0; @@ -410,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi) { struct ino_entry *e, *tmp; int i; @@ -722,47 +722,48 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(fi, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(fi, flag); + list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || + !is_inode_flag_set(F2FS_I(inode), flag)) + return; + + list_del_init(&fi->dirty_list); + clear_inode_flag(fi, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } @@ -770,70 +771,60 @@ out: void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; - - new->inode = inode; - INIT_LIST_HEAD(&new->list); - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); + spin_lock(&sbi->inode_lock[DIR_INODE]); + __add_dirty_inode(inode, DIR_INODE); + spin_unlock(&sbi->inode_lock[DIR_INODE]); } -void remove_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) - return; + struct f2fs_inode_info *fi = F2FS_I(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { + clear_inode_flag(fi, FI_DELAY_IPUT); iput(inode); } } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -868,11 +859,9 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +874,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, 0, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -919,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -945,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -1030,7 +1018,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1058,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -1081,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_dirty_inode(sbi); + release_ino_entry(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); @@ -1133,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1139,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..ac9e7c6aac74 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn) /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + if (set_page_dirty(node_page)) + dn->node_changed = true; } int reserve_new_block(struct dnode_of_data *dn) @@ -412,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -441,12 +442,11 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < @@ -494,14 +494,10 @@ alloc: if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, +static int __allocate_data_blocks(struct inode *inode, loff_t offset, size_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -510,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, u64 len = F2FS_BYTES_TO_BLK(count); bool allocated; u64 end_offset; + int err = 0; while (len) { - f2fs_balance_fs(sbi); f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) goto out; allocated = false; @@ -526,12 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto sync_out; + } blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) + err = __allocate_data_block(&dn); + if (err) goto sync_out; allocated = true; } @@ -545,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); } - return; + return err; sync_out: if (allocated) @@ -554,7 +556,8 @@ sync_out: f2fs_put_dnode(&dn); out: f2fs_unlock_op(sbi); - return; + f2fs_balance_fs(sbi, dn.node_changed); + return err; } /* @@ -566,7 +569,7 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; @@ -577,6 +580,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; struct extent_info ei; bool allocated = false; + block_t blkaddr; map->m_len = 0; map->m_flags = 0; @@ -592,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -640,12 +644,21 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs++; get_next: + if (map->m_len >= maxblocks) + goto sync_out; + if (dn.ofs_in_node >= end_offset) { if (allocated) sync_inode_page(&dn); allocated = false; f2fs_put_dnode(&dn); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + f2fs_lock_op(sbi); + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { @@ -657,52 +670,53 @@ get_next: end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; } + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } + } - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + /* Give more consecutive addresses for the readahead */ + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + map->m_len++; + goto get_next; } + sync_out: if (allocated) sync_inode_page(&dn); put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + } out: trace_f2fs_map_blocks(inode, map, err); return err; @@ -742,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP); } @@ -761,10 +779,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -779,16 +796,19 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, mutex_lock(&inode->i_mutex); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; @@ -800,59 +820,37 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk++) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. + */ + flags |= FIEMAP_EXTENT_LAST; + } - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; + if (start_blk > last_blk || ret) + goto out; - start_blk += logical_to_blk(inode, size); + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -1083,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio) */ if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); @@ -1179,10 +1178,11 @@ out: if (err) ClearPageUptodate(page); unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + remove_dirty_inode(inode); + } return 0; redirty_out: @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; @@ -1369,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (locked) mutex_unlock(&sbi->writepages); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; @@ -1382,13 +1386,85 @@ skip_write: static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + bool restart = false; + + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || (!err && dn.data_blkaddr == NULL_ADDR)) + restart = true; + if (restart) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1396,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1424,41 +1498,27 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; + if (need_balance && has_not_enough_free_secs(sbi, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; } - err = f2fs_get_block(&dn, index); - if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - f2fs_wait_on_page_writeback(page, DATA); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); if (len == PAGE_CACHE_SIZE) goto out_update; @@ -1474,14 +1534,14 @@ put_next: goto out_update; } - if (dn.data_blkaddr == NEW_ADDR) { + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, + .blk_addr = blkaddr, .page = page, .encrypted_page = NULL, }; @@ -1512,10 +1572,6 @@ out_clear: clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1540,6 +1596,7 @@ static int f2fs_write_end(struct file *file, } f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1567,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int err; /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; @@ -1583,11 +1638,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; + err = __allocate_data_blocks(inode, offset, count); + if (err) goto out; - } } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ad1b18a7705b..4fb6ef88a34f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,12 +38,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -189,10 +192,10 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); @@ -267,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -288,8 +292,8 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -297,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", @@ -404,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8f92..faa7495e2d7e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, namehash = f2fs_dentry_hash(&name); - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, goto out; max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + F2FS_I(dir)->i_current_depth = max_depth; + mark_inode_dirty(dir); + } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, &fname, res_page); @@ -444,7 +450,7 @@ error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); remove_inode_page(inode); return ERR_PTR(err); } @@ -630,6 +636,7 @@ fail: f2fs_put_page(dentry_page, 1); out: f2fs_fname_free_filename(&fname); + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -855,25 +865,27 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } out: f2fs_fname_crypto_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..ccd5c636d3fe 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) @@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { @@ -152,7 +154,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -164,34 +166,33 @@ static void __drop_largest_extent(struct inode *inode, largest->len = 0; } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; set_extent_info(&ei, le32_to_cpu(i_ext->fofs), le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -549,45 +551,44 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_tree *et, *next; struct extent_node *en, *tmp; unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; int remained; + bool do_free = false; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + } - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -599,15 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!remained--) break; list_del_init(&en->list); + do_free = true; } spin_unlock(&sbi->extent_lock); + if (do_free == false) + goto unlock_out; + /* * reset ino for searching victims from beginning of global extent tree. */ ino = F2FS_ROOT_INO(sbi); - while ((found = radix_tree_gang_lookup(root, + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -615,9 +620,13 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) for (i = 0; i < found; i++) { struct extent_tree *et = treevec[i]; - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); + if (!atomic_read(&et->node_cnt)) + continue; + + if (write_trylock(&et->lock)) { + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + } if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; @@ -637,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); @@ -656,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -722,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec6067c33a3f..ff79054c6cf6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/bio.h> +#include <linux/blkdev.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -54,6 +55,7 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -125,6 +127,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 120 /* 2 mins */ struct cp_control { int reason; @@ -158,13 +161,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -234,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -256,10 +254,16 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + /* * For INODE and NODE manager */ @@ -357,9 +361,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -434,8 +438,8 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct list_head dirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -544,6 +548,7 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ block_t data_blkaddr; /* block address of the node block */ }; @@ -647,6 +652,7 @@ struct f2fs_sm_info { enum count_type { F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -695,6 +701,12 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -711,11 +723,17 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ /* for node-related operations */ @@ -737,23 +755,26 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -771,6 +792,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ @@ -809,7 +831,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -824,6 +846,31 @@ struct f2fs_sb_info { unsigned int shrinker_run_no; }; +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ @@ -1059,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1075,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) return; atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1092,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; return ((get_pages(sbi, block_type) + pages_per_sec - 1) >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; } @@ -1416,6 +1461,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1659,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1765,7 +1812,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *, struct page *); int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1811,9 +1858,9 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1823,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1845,6 +1892,7 @@ struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1875,8 +1923,9 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + int ndirty_node, ndirty_meta; + int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, inmem_pages, wb_pages; @@ -1886,7 +1935,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1907,10 +1956,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1985,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2013,7 +2064,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2067,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad52b..18ddb1e5182a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -96,6 +96,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -201,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); @@ -233,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. @@ -261,8 +259,10 @@ sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto out; + } if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); @@ -275,12 +275,13 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); + remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); + remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); @@ -418,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -483,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) F2FS_I(dn->inode)) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -604,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -679,13 +679,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode, true); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -727,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -778,13 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -815,7 +820,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; @@ -918,7 +923,7 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) int ret = 0; for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); @@ -941,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -991,13 +992,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1104,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1154,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; - f2fs_balance_fs(sbi); - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -1246,6 +1239,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: @@ -1353,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) return 0; @@ -1363,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; } @@ -1384,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode, false); - if (ret) + if (ret) { + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); goto err_out; + } } ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); @@ -1410,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1441,13 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); - - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, true); + } + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + } mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1487,6 +1488,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) default: return -EINVAL; } + f2fs_update_time(sbi, REQ_TIME); return 0; } @@ -1517,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1540,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) sizeof(policy))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return f2fs_process_policy(&policy, inode); #else return -EOPNOTSUPP; @@ -1586,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1629,7 +1633,6 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1637,13 +1640,196 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + return f2fs_sync_fs(sbi->sb, 1); +} - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; - return 0; + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_CACHE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT; + + f2fs_balance_fs(sbi, true); + + mutex_lock(&inode->i_mutex); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); +out: + mutex_unlock(&inode->i_mutex); + if (!err) + range->len = (u64)total << PAGE_CACHE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1679,6 +1865,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); default: return -ENOTTY; } @@ -1706,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..f610c2a9bdde 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> -#include <linux/blkdev.h> #include "f2fs.h" #include "node.h" @@ -173,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -832,8 +831,10 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..c3f0b7d4cfca 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -16,9 +16,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -177,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; + if (!f2fs_has_inline_data(inode)) + return 0; + page = grab_cache_page(inode->i_mapping, 0); if (!page) return -ENOMEM; @@ -199,6 +199,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5528801a5baf..2adeff26be11 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); get_inline_info(fi, ri); @@ -222,7 +223,7 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -260,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -279,10 +281,11 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi); } - return; + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -300,9 +303,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -328,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -358,9 +360,9 @@ no_delete: if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { if (err && err != -ENOENT) alloc_nid_done(sbi, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e7587fce1b80..6f944e5eb76e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); @@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !f2fs_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); @@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { @@ -344,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -357,6 +365,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -437,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -448,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + f2fs_balance_fs(sbi, true); + set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -485,8 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -494,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -520,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -536,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -608,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -639,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_whiteout; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -670,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -767,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -811,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -933,7 +944,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; + struct f2fs_str cstr = FSTR_INIT(NULL, 0); struct f2fs_str pstr = FSTR_INIT(NULL, 0); struct f2fs_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); @@ -956,6 +967,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; cstr.len = le16_to_cpu(sd->len); + + /* this is broken symlink case */ + if (unlikely(cstr.len == 0)) { + res = -ENOENT; + goto errout; + } cstr.name = kmalloc(cstr.len, GFP_NOFS); if (!cstr.name) { res = -ENOMEM; @@ -964,7 +981,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.name[0] == 0)) { res = -ENOENT; goto errout; } @@ -1005,10 +1022,12 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, +#endif }; #endif diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..342597a5897f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, { struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + down_write(&nm_i->nat_tree_lock); + /* Check current segment summary */ mutex_lock(&curseg->curseg_mutex); i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); @@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) cache: /* cache nat entry */ cache_nat_entry(NM_I(sbi), nid, &ne); + up_write(&nm_i->nat_tree_lock); } /* @@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn, fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); - set_page_dirty(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; @@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; + if (!nid) + return; + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); @@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +/* + * readahead MAX_RA_NODE number of node pages. + */ +void ra_node_pages(struct page *parent, int start) { - struct page *page; - int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } + blk_start_plug(&plug); - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; + /* Then, try readahead for siblings of the desired node */ + end = start + MAX_RA_NODE; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); } - return page; + + blk_finish_plug(&plug); } -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) +struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; struct page *page; - int err, i, end; - nid_t nid; + int err; - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) @@ -1108,46 +1105,53 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1); lock_page(page); + + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + f2fs_bug_on(sbi, nid != nid_of_node(page)); return page; } +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + void sync_inode_page(struct dnode_of_data *dn) { + int ret = 0; + if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); + ret = update_inode(dn->inode, dn->node_page); } else if (dn->inode_page) { if (!dn->inode_page_locked) lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); + ret = update_inode(dn->inode, dn->inode_page); if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - update_inode_page(dn->inode); + ret = update_inode_page(dn->inode); } + dn->node_changed = ret ? true: false; } int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, @@ -1175,6 +1179,11 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + return -EIO; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1349,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page, up_read(&sbi->node_write); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1440,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; - up_read(&nm_i->nat_tree_lock); if (allocated) return 0; } @@ -1532,6 +1538,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1560,6 +1568,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1582,8 +1591,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1601,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1842,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) raw_ne = nat_in_journal(sum, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); @@ -1883,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1920,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } @@ -1937,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1959,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them @@ -1967,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1976,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..d4d1f636fe1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -183,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -317,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); @@ -327,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..589b20b8677b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -459,8 +488,7 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -469,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi, block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -556,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: @@ -595,7 +623,7 @@ out: .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..5904a411c86f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { + + while (1) { + if (*p == 0) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { + + while (1) { + if (*p == ~0UL) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -233,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) * inode becomes free by iget_locked in f2fs_iget. */ if (!abort) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); } @@ -257,6 +237,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) submit_bio = true; } } else { + ClearPageUptodate(cur->page); trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); } set_page_private(cur->page, 0); @@ -281,8 +262,10 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (!need) + return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -310,8 +293,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) + sync_dirty_inodes(sbi, FILE_INODE); f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -1134,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1164,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1749,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, if (le32_to_cpu(nid_in_journal(sum, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) return update_nats_in_cursum(sum, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(sum); i++) if (le32_to_cpu(segno_in_journal(sum, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) return update_sits_in_cursum(sum, 1); } return -1; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..93606f281bf9 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..6134832baaaf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,6 +67,7 @@ enum { Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, Opt_err, }; @@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = { {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, {Opt_err, NULL}, }; @@ -216,7 +218,8 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -235,6 +238,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), NULL, }; @@ -406,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -432,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -548,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_dirty_inode(sbi); + release_ino_entry(sbi); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); @@ -566,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + kfree(sbi->raw_super); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -686,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -898,7 +907,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,10 +923,82 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } +static inline bool sanity_check_area_boundary(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != + segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + main_blkaddr, + segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } + + return false; +} + static int sanity_check_raw_super(struct super_block *sb, struct f2fs_super_block *raw_super) { @@ -947,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sb, raw_super)) + return 1; + return 0; } @@ -1018,7 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1032,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { int block = 0; - struct buffer_head *buffer; - struct f2fs_super_block *super; + struct buffer_head *bh; + struct f2fs_super_block *super, *buf; int err = 0; + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; retry: - buffer = sb_bread(sb, block); - if (!buffer) { + bh = sb_bread(sb, block); + if (!bh) { *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EIO; - goto out; - } + err = -EIO; + goto next; } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); + buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); + if (sanity_check_raw_super(sb, buf)) { + brelse(bh); *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EINVAL; - goto out; - } + err = -EINVAL; + goto next; } if (!*raw_super) { - *raw_super_buf = buffer; + memcpy(super, buf, sizeof(*super)); + *valid_super_block = block; *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); } + brelse(bh); +next: /* check the validity of the second superblock */ if (block == 0) { block++; goto retry; } -out: /* No valid superblock */ - if (!*raw_super) + if (!*raw_super) { + kfree(super); return err; + } return 0; } +static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) +{ + struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi); + struct buffer_head *bh; + int err; + + bh = sb_getblk(sbi->sb, block); + if (!bh) + return -EIO; + + lock_buffer(bh); + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + brelse(bh); + + return err; +} + int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; int err; /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); - - sbh->b_blocknr = block; + err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - return err; + return __f2fs_commit_super(sbi, sbi->valid_super_block); } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; long err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1150,7 +1260,8 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sb, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; @@ -1167,7 +1278,9 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -1183,7 +1296,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->sb = sb; sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); @@ -1236,8 +1349,10 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1355,12 +1470,14 @@ try_onemore: f2fs_commit_super(sbi, true); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -1387,7 +1504,7 @@ free_meta_inode: free_options: kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: kfree(sbi); @@ -1424,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1478,10 +1596,14 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_crypto: diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 036952a945fa..10f1e784fa23 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -571,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -580,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3698..6aece96df19f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -677,7 +677,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/file.c b/fs/file.c index 1aed0add16a2..1fbc5c0555a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5b99..4d69d5c0bedc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 1d709d496364..f99f8e94de3f 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -114,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d421..1ca95c232bb5 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae8bf..5d54490a136d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index f49be23e78aa..cfaa18c7a337 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a561591896bd..458cf463047b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d8f51ee8126b..47789292a582 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -4,11 +4,11 @@ * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. + * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ @@ -738,7 +738,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ @@ -1202,7 +1202,6 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1322,7 +1321,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; @@ -1356,26 +1355,4 @@ static int __init init_hugetlbfs_fs(void) out2: return error; } - -static void __exit exit_hugetlbfs_fs(void) -{ - struct hstate *h; - int i; - - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(hugetlbfs_inode_cachep); - i = 0; - for_each_hstate(h) - kern_unmount(hugetlbfs_vfsmount[i++]); - unregister_filesystem(&hugetlbfs_fs_type); -} - -module_init(init_hugetlbfs_fs) -module_exit(exit_hugetlbfs_fs) - -MODULE_LICENSE("GPL"); +fs_initcall(init_hugetlbfs_fs) diff --git a/fs/inode.c b/fs/inode.c index 4230f66b7410..e491e54d2430 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1883,7 +1883,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 61abdc4920da..bcd2d41b318a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e3176a1..bb080c272149 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index f3a4857ff071..5a3da3f52908 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work) { struct delayed_work *dwork; - dwork = container_of(work, struct delayed_work, work); + dwork = to_delayed_work(work); return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf098..900925b5eb8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -898,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 91e004518237..821973853340 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - /* - * If the ino of the sysfs entry created for a kmem cache gets - * allocated from an ida layer, which is accounted to the memcg that - * owns the cache, the memcg will get pinned forever. So do not account - * ino ida allocations. - */ - ret = ida_simple_get(&root->ino_ida, 1, 0, - GFP_KERNEL | __GFP_NOACCOUNT); + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; kn->ino = ret; @@ -694,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, return NULL; } +static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, + const unsigned char *path, + const void *ns) +{ + static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ + size_t len = strlcpy(path_buf, path, PATH_MAX); + char *p = path_buf; + char *name; + + lockdep_assert_held(&kernfs_mutex); + + if (len >= PATH_MAX) + return NULL; + + while ((name = strsep(&p, "/")) && parent) { + if (*name == '\0') + continue; + parent = kernfs_find_ns(parent, name, ns); + } + + return parent; +} + /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under @@ -719,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** + * kernfs_walk_and_get_ns - find and get kernfs_node with the given path + * @parent: kernfs_node to search under + * @path: path to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with path @path under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_walk_ns(parent, path, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} + +/** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index 09ed066c0221..2b4503163930 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS tristate "LogFS file system" - depends on (MTD || BLOCK) + depends on MTD || (!MTD && BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 0fce46d62b9c..db9cfc598883 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -409,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index cb1789ca1ee6..f975d667c539 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index ce1eb3f9dfe8..1af15fcbe57b 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 807eb6ef4f91..f0939d097406 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, + &args->stateid, -ntohl(res)); goto out; + } /* Set up a helper thread to actually return the delegation */ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: @@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } - trace_nfs4_recall_delegation(inode, -ntohl(res)); + trace_nfs4_cb_recall(cps->clp, &args->fh, inode, + &args->stateid, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, return lo; } +/* + * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) + */ +static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + + if (newseq > oldseq + 1) + return false; + return true; +} + static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { @@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); - if (!lo) + if (!lo) { + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, + &args->cbl_stateid, -rv); goto out; + } ino = lo->plh_inode; spin_lock(&ino->i_lock); + if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { + rv = NFS4ERR_DELAY; + goto unlock; + } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) { + /* + * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) + */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { rv = NFS4ERR_DELAY; goto unlock; } + if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, + &args->cbl_range)) { + rv = NFS4_OK; + goto unlock; + } + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } + pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); - trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, + &args->cbl_stateid, -rv); iput(ino); out: return rv; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8a0530921685..c82a21228a34 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2431,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) } EXPORT_SYMBOL_GPL(nfs_may_open); +static int nfs_execute_ok(struct inode *inode, int mask) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (mask & MAY_NOT_BLOCK) + ret = nfs_revalidate_inode_rcu(server, inode); + else + ret = nfs_revalidate_inode(server, inode); + if (ret == 0 && !execute_ok(inode)) + ret = -EACCES; + return ret; +} + int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; @@ -2448,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: + if ((mask & MAY_OPEN) && + nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) + return 0; break; case S_IFDIR: /* @@ -2480,8 +2497,8 @@ force_lookup: res = PTR_ERR(cred); } out: - if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) - res = -EACCES; + if (!res && (mask & MAY_EXEC)) + res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4b1d08f56aba..7ab7ec9f4eed 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) -{ - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -} -EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); - static void nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) { @@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) req = nfs_list_entry(reqs.next); nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + list_splice_init(&reqs, &failed); + goto out_failed; + } list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { @@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_list_add_request(req, &failed); spin_lock(cinfo.lock); dreq->flags = 0; - dreq->error = -EIO; + if (desc.pg_error < 0) + dreq->error = desc.pg_error; + else + dreq->error = -EIO; spin_unlock(cinfo.lock); } nfs_release_request(req); } nfs_pageio_complete(&desc); +out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } -static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - /* There is no lock to clear */ + struct nfs_direct_req *dreq = cinfo->dreq; + + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_mark_request_commit(req, NULL, cinfo, 0); } static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, - .error_cleanup = nfs_direct_error_cleanup, + .resched_write = nfs_direct_resched_write, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head) } } +static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + + spin_lock(&dreq->lock); + if (dreq->error == 0) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = hdr->args.count; + } + spin_unlock(&dreq->lock); +} + static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .error_cleanup = nfs_write_sync_pgio_error, .init_hdr = nfs_direct_pgio_init, .completion = nfs_direct_write_completion, + .reschedule_io = nfs_direct_write_reschedule_io, }; @@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + nfs_free_request(req); + result = desc.pg_error; + break; + } nfs_lock_request(req); req->wb_index = pos >> PAGE_SHIFT; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93e236429c5d..4ef8f5addcad 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page, * so it will not block due to pages that will shortly be freeable. */ nfsi = NFS_I(mapping->host); - if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + if (atomic_read(&nfsi->commit_info.rpcs_out)) { *writeback = true; return; } @@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_page(inode, page); + return nfs_wb_launder_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { - status = nfs_iocounter_wait(&l_ctx->io_count); + status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); if (status < 0) return status; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 02ec07973bc4..bb1f4e7a3270 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); + pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -883,13 +884,19 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } + /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 03516c80855a..6594e9f903a0 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return false; for (i = 0; i < m1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; i++) { + for (j = 0; j < m2->fh_versions_cnt; j++) { if (nfs_compare_fh(&m1->fh_versions[i], &m2->fh_versions[j]) == 0) { found_fh = true; @@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, } p = xdr_inline_decode(&stream, 4); - if (p) - fls->flags = be32_to_cpup(p); + if (!p) + goto out_sort_mirrors; + fls->flags = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_sort_mirrors; + for (i=0; i < fls->mirror_array_cnt; i++) + fls->mirror_array[i]->report_interval = be32_to_cpup(p); +out_sort_mirrors: ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; - if (layoutstats_timer != 0) + if (mirror->report_interval != 0) + report_interval = (s64)mirror->report_interval * 1000LL; + else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= report_interval) { @@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -867,18 +889,25 @@ static unsigned int ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out; + } + } if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); +out: return 1; } @@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); - if (!hdr->dreq) { - struct nfs_open_context *ctx; - - ctx = nfs_list_entry(hdr->pages.next)->wb_context; - set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - hdr->completion_ops->error_cleanup(&hdr->pages); - } else { - nfs_direct_set_resched_writes(hdr->dreq); - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + hdr->completion_ops->reschedule_io(hdr); return; } @@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, return -NFS4ERR_RESET_TO_PNFS; out_retry: task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); return -EAGAIN; } @@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + switch (status) { + case NFS4ERR_DELAY: + case NFS4ERR_GRACE: + return; + default: + break; + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, @@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) return ff_layout_test_devid_unavailable(node); } -static int ff_layout_read_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + hdr->res.count); +} +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, } hdr->pgio_done_cb = ff_layout_read_done_cb; + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_read_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); } +static void ff_layout_read_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + + static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, true); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: - rpc_restart_call_prepare(task); return -EAGAIN; } @@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return 0; } -static int ff_layout_write_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); +} +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EAGAIN; } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_write_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, +static void ff_layout_write_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + +static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), 0, task->tk_start); } +static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + struct nfs_page *req; + __u64 count = 0; + + if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); +} + +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + ff_layout_commit_record_layoutstats_start(task, cdata); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { ff_layout_commit_prepare_common(task, data); @@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) static void ff_layout_commit_done(struct rpc_task *task, void *data) { - struct nfs_commit_data *cdata = data; - struct nfs_page *req; - __u64 count = 0; - - if (task->tk_status == 0) { - list_for_each_entry(req, &cdata->pages, wb_list) - count += req->wb_bytes; - } - - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - count, count, NFS_FILE_SYNC); - pnfs_generic_write_commit_done(task, data); } @@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) { struct nfs_commit_data *cdata = data; + ff_layout_commit_record_layoutstats_done(task, cdata); rpc_count_iostats_metrics(task, &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); } +static void ff_layout_commit_release(void *data) +{ + struct nfs_commit_data *cdata = data; + + ff_layout_commit_record_layoutstats_done(&cdata->task, cdata); + pnfs_generic_commit_release(data); +} + static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static enum pnfs_try_status diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 2bb08bc6aaf0..dd353bb7dc0a 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat write_stat; ktime_t start_time; ktime_t last_report_time; + u32 report_interval; }; struct nfs4_ff_layout_segment { diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e125e55de86d..bd0327541366 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -429,22 +429,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (fail_return) { - pnfs_error_mark_layout_for_return(ino, lseg); - if (ff_layout_has_available_ds(lseg)) - pnfs_set_retry_layoutget(lseg->pls_layout); - else - pnfs_clear_retry_layoutget(lseg->pls_layout); - - } else { + if (!fail_return) { if (ff_layout_has_available_ds(lseg)) set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lseg->pls_layout->plh_flags); - else { + else pnfs_error_mark_layout_for_return(ino, lseg); - pnfs_clear_retry_layoutget(lseg->pls_layout); - } - } + } else + pnfs_error_mark_layout_for_return(ino, lseg); } out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bdb4dc7b4ecd..8e24d886d2c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +static int nfs_wait_killable(int mode) { freezable_schedule_unsafe(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } + +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +{ + return nfs_wait_killable(mode); +} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); +int nfs_wait_atomic_killable(atomic_t *p) +{ + return nfs_wait_killable(TASK_KILLABLE); +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -700,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); - nfs_iocounter_init(&l_ctx->io_count); + atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) @@ -913,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + /* + * We fatal error on write before. Try to writeback + * every page again. + */ + if (ctx->error < 0) + invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -1663,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1724,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else + } else { nfsi->cache_validity |= save_cache_validity; + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } else if (server->caps & NFS_CAP_MTIME) + } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - } else if (server->caps & NFS_CAP_CTIME) + } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1759,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)cur_isize, (long long)new_isize); } - } else + } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); - else if (server->caps & NFS_CAP_ATIME) + else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -1780,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } - } else if (server->caps & NFS_CAP_MODE) + } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } else if (server->caps & NFS_CAP_OWNER) + } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } else if (server->caps & NFS_CAP_OWNER_GROUP) + } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1818,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); } - } else if (server->caps & NFS_CAP_NLINK) + } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATTR | NFS_INO_REVAL_FORCED); + cache_revalidated = false; + } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; + else + cache_revalidated = false; /* Update attrtimeo value if we're out of the unstable period */ if (invalid & NFS_INO_INVALID_ATTR) { @@ -1840,9 +1872,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { - if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { - if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) - nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + if (cache_revalidated) { + if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, + nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { + nfsi->attrtimeo <<= 1; + if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); + } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ @@ -1851,7 +1887,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* Don't declare attrcache up to date if there were no attrs! */ - if (fattr->valid != 0) + if (cache_revalidated) invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ @@ -1933,7 +1969,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dea85f7f918..4e8cc942336c 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); -int nfs_iocounter_wait(struct nfs_io_counter *c); +int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); @@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline void nfs_iocounter_init(struct nfs_io_counter *c) -{ - c->flags = 0; - atomic_set(&c->io_count, 0); -} - static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) { WARN_ON_ONCE(desc->pg_mirror_count < 1); return desc->pg_mirror_count > 1; } +static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, @@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); +extern int nfs_wait_atomic_killable(atomic_t *p); /* super.c */ extern const struct super_operations nfs_sops; @@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); -extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); @@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } +static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) +{ + return ~crc32_le(0xFFFFFFFF, &stateid->other[0], + NFS4_STATEID_OTHER_SIZE); +} #else static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return 0; } +static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) +{ + return 0; +} #endif + +static inline bool nfs_error_is_fatal(int err) +{ + switch (err) { + case -ERESTARTSYS: + case -EIO: + case -ENOSPC: + case -EROFS: + case -E2BIG: + return true; + default: + return false; + } +} diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b1ce9825430..6e8174930a48 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -204,6 +204,8 @@ static void nfs42_layoutstat_done(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -211,12 +213,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&data->args.stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + } else + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: - NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; default: - dprintk("%s server returns %d\n", __func__, task->tk_status); + break; } + + dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c57d1332c1c8..4bfc33ad0563 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + | FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_open_noattr_bitmap[3] = { @@ -1385,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ + spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { nfs4_stateid_copy(&state->stateid, deleg_stateid); @@ -1393,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s if (open_stateid != NULL) nfs_set_open_stateid_locked(state, open_stateid, fmode); write_sequnlock(&state->seqlock); - spin_lock(&state->owner->so_lock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } @@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { state = nfs4_try_open_cached(data); + trace_nfs4_cached_open(data->state); goto out; } @@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } return; unlock_no_action: + trace_nfs4_cached_open(data->state); rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); + trace_nfs4_setattr(inode, &arg.stateid, status); return status; } @@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, int err; do { err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); - trace_nfs4_setattr(inode, err); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", clp->cl_ipaddr, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); @@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) static int nfs4_init_uniquifier_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); @@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) static int nfs4_init_uniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s", + scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; @@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (data == NULL) return -ENOMEM; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + + nfs4_state_protect(server->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; @@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); - trace_nfs4_delegreturn(inode, err); + trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->cancelled = 1; rpc_put_task(task); dprintk("%s: done, ret = %d!\n", __func__, ret); + trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret); return ret; } @@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); - trace_nfs4_lock_expired(request, state, F_SETLK, err); switch (err) { default: goto out; @@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * do { err = _nfs4_proc_setlk(state, cmd, request); - trace_nfs4_set_lock(request, state, cmd, err); if (err == -NFS4ERR_DENIED) err = -EAGAIN; err = nfs4_handle_exception(NFS_SERVER(state->inode), @@ -6847,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { }, .allow.u.words = { [0] = 1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN) | 1 << (OP_COMMIT), [1] = 1 << (OP_SECINFO - 32) | 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_LAYOUTRETURN - 32) | 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32) | 1 << (OP_WRITE - 32) @@ -6915,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, } if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) && + test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); } + if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { + dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &clp->cl_sp4_flags); + } + if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); @@ -7748,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); + int ret; dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -7758,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; - if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, &lgp->args.range, - lgp->args.ctx->state)) { - rpc_exit(task, NFS4_OK); - } + lgp->args.ctx->state); + if (ret < 0) + rpc_exit(task, ret); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -7783,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + + /* + * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs + * on the file. set tk_status to -ENODATA to tell upper layer to + * retry go inband. + */ + case -NFS4ERR_LAYOUTUNAVAILABLE: + task->tk_status = -ENODATA; + goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). @@ -7979,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, + &lgp->res.stateid, status); /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) @@ -8035,11 +8062,11 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); @@ -8071,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) }; int status = 0; + nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client, + NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &task_setup_data.rpc_client, &msg); + dprintk("--> %s\n", __func__); if (!sync) { lrp->inode = nfs_igrab_and_active(lrp->args.inode); @@ -8086,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutreturn(lrp->args.inode, status); + trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; @@ -8234,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutcommit(data->args.inode, status); + trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status); dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0fbd3ab1be22..8693d77c45ea 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -12,7 +12,7 @@ #include "nfs4idmap.h" #include "callback.h" -static const int nfs_set_port_min = 0; +static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d774335cc8bc..2850bce19244 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -6,6 +6,7 @@ #include "internal.h" #include "nfs4session.h" #include "callback.h" +#include "pnfs.h" #define CREATE_TRACE_POINTS #include "nfs4trace.h" diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 671cf68fe56b..2c8d05dae5b1 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done, __entry->highest_slotid = res->sr_highest_slotid; __entry->target_highest_slotid = res->sr_target_highest_slotid; + __entry->status_flags = res->sr_status_flags; __entry->error = res->sr_status; ), TP_printk( @@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __field(u64, fileid) __field(u64, dir) __string(name, ctx->dentry->d_name.name) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, openstateid_seq) + __field(u32, openstateid_hash) ), TP_fast_assign( @@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; - if (!IS_ERR_OR_NULL(state)) + if (!IS_ERR_OR_NULL(state)) { inode = state->inode; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->openstateid_seq = + be32_to_cpu(state->open_stateid.seqid); + __entry->openstateid_hash = + nfs_stateid_hash(&state->open_stateid); + } else { + __entry->stateid_seq = 0; + __entry->stateid_hash = 0; + __entry->openstateid_seq = 0; + __entry->openstateid_hash = 0; + } if (inode != NULL) { __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_printk( "error=%d (%s) flags=%d (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " - "name=%02x:%02x:%llu/%s", + "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " + "openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->flags, @@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->stateid_seq, __entry->stateid_hash, + __entry->openstateid_seq, __entry->openstateid_hash ) ); @@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); +TRACE_EVENT(nfs4_cached_open, + TP_PROTO( + const struct nfs4_state *state + ), + TP_ARGS(state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x stateid=%d:0x%08x", + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + TRACE_EVENT(nfs4_close, TP_PROTO( const struct nfs4_state *state, @@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close, __field(u64, fileid) __field(unsigned int, fmode) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->stateid); ), TP_printk( "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " - "fhandle=0x%08x", + "fhandle=0x%08x openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->fmode ? show_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) cmd=%s:%s range=%lld:%lld " - "fileid=%02x:%02x:%llu fhandle=0x%08x", + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), @@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), \ TP_ARGS(request, state, cmd, error)) DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); +TRACE_EVENT(nfs4_set_lock, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + const nfs4_stateid *lockstateid, + int cmd, + int error + ), + + TP_ARGS(request, state, lockstateid, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, lockstateid_seq) + __field(u32, lockstateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->lockstateid_seq = + be32_to_cpu(lockstateid->seqid); + __entry->lockstateid_hash = + nfs_stateid_hash(lockstateid); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x lockstateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->lockstateid_seq, __entry->lockstateid_hash + ) +); + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, @@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit, __field(dev_t, dev) __field(u32, fhandle) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( __entry->dev = res->server->s_dev; __entry->fhandle = nfs_fhandle_hash(args->fhandle); __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(args->stateid); ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, ), \ TP_ARGS(inode, error)) -DEFINE_NFS4_INODE_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_EVENT(nfs4_access); DEFINE_NFS4_INODE_EVENT(nfs4_readlink); DEFINE_NFS4_INODE_EVENT(nfs4_readdir); @@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); -DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, + TP_PROTO( + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(inode, stateid, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + +#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(inode, stateid, error)) + +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), \ TP_ARGS(clp, fhandle, inode, error)) DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); -DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); +DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(clp, fhandle, inode, stateid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, stateid, error)) +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( @@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget, const struct nfs_open_context *ctx, const struct pnfs_layout_range *args, const struct pnfs_layout_range *res, + const nfs4_stateid *layout_stateid, int error ), - TP_ARGS(ctx, args, res, error), + TP_ARGS(ctx, args, res, layout_stateid, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget, __field(u64, offset) __field(u64, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = d_inode(ctx->dentry); + const struct nfs4_state *state = ctx->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget, __entry->offset = args->offset; __entry->count = args->length; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + if (!error) { + __entry->layoutstateid_seq = + be32_to_cpu(layout_stateid->seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(layout_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "iomode=%s offset=%llu count=%llu", + "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget, __entry->fhandle, show_pnfs_iomode(__entry->iomode), (unsigned long long)__entry->offset, - (unsigned long long)__entry->count + (unsigned long long)__entry->count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +#define show_pnfs_update_layout_reason(reason) \ + __print_symbolic(reason, \ + { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \ + { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \ + { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \ + { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \ + { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \ + { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \ + { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \ + { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ + { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ + { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + +TRACE_EVENT(pnfs_update_layout, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + enum pnfs_update_layout_reason reason + ), + TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(enum pnfs_update_layout_reason, reason) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + __entry->reason = reason; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x (%s)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + show_pnfs_update_layout_reason(__entry->reason) + ) +); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 59f838cdc009..9f80a086b612 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_COMMIT, "COMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba0d8..8ce4f61cbaa5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } -static void -nfs_iocounter_inc(struct nfs_io_counter *c) -{ - atomic_inc(&c->io_count); -} - -static void -nfs_iocounter_dec(struct nfs_io_counter *c) -{ - if (atomic_dec_and_test(&c->io_count)) { - clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_atomic(); - wake_up_bit(&c->flags, NFS_IO_INPROGRESS); - } -} - -static int -__nfs_iocounter_wait(struct nfs_io_counter *c) -{ - wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); - DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); - int ret = 0; - - do { - prepare_to_wait(wq, &q.wait, TASK_KILLABLE); - set_bit(NFS_IO_INPROGRESS, &c->flags); - if (atomic_read(&c->io_count) == 0) - break; - ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); - } while (atomic_read(&c->io_count) != 0 && !ret); - finish_wait(wq, &q.wait); - return ret; -} - /** * nfs_iocounter_wait - wait for i/o to complete - * @c: nfs_io_counter to use + * @l_ctx: nfs_lock_context with io_counter to use * * returns -ERESTARTSYS if interrupted by a fatal signal. * Otherwise returns 0 once the io_count hits 0. */ int -nfs_iocounter_wait(struct nfs_io_counter *c) +nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - if (atomic_read(&c->io_count) == 0) - return 0; - return __nfs_iocounter_wait(c); + return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, + TASK_KILLABLE); } /* @@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; - nfs_iocounter_inc(&l_ctx->io_count); + atomic_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - nfs_iocounter_dec(&l_ctx->io_count); + if (atomic_dec_and_test(&l_ctx->io_count)) + wake_up_atomic_t(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); * @desc: IO descriptor * @hdr: pageio header */ -static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) +static void nfs_pgio_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_mirror *mirror; - u32 midx; - set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - /* TODO: Make sure it's right to clean up all mirrors here - * and not just hdr->pgio_mirror_idx */ - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - } - return -ENOMEM; } /** @@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, unsigned int pagecount, pageused; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) - return nfs_pgio_error(desc, hdr); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); pages = hdr->page_array.pagevec; @@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *pages++ = last_page = req->wb_page; } } - if (WARN_ON_ONCE(pageused != pagecount)) - return nfs_pgio_error(desc, hdr); + if (WARN_ON_ONCE(pageused != pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -EINVAL; + return desc->pg_error; + } if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) @@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; - mirror = nfs_pgio_current_mirror(desc); - hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - /* TODO: make sure this is right with mirroring - or - * should it back out all mirrors? */ - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); @@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (pgio->pg_error < 0) + return pgio->pg_error; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) return -EINVAL; @@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) pgio->pg_mirrors_dynamic = NULL; } -static bool nfs_match_open_context(const struct nfs_open_context *ctx1, - const struct nfs_open_context *ctx2) -{ - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; -} - static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { @@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); + if (desc->pg_error < 0) + return 0; mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) @@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, bytes = req->wb_bytes; nfs_pageio_setup_mirroring(desc, req); + if (desc->pg_error < 0) + goto out_failed; for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { @@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (IS_ERR(dupreq)) { nfs_page_group_unlock(req); - return 0; + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; } nfs_lock_request(dupreq); @@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - return 0; + goto out_failed; } return 1; + +out_failed: + /* + * We might have failed before sending any reqs over wire. + * Clean up rest of the reqs in mirror pg_list. + */ + if (desc->pg_error) { + struct nfs_pgio_mirror *mirror; + void (*func)(struct list_head *); + + /* remember fatal errors */ + if (nfs_error_is_fatal(desc->pg_error)) + mapping_set_error(desc->pg_inode->i_mapping, + desc->pg_error); + + func = desc->pg_completion_ops->error_cleanup; + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + func(&mirror->pg_list); + } + } + return 0; } /* @@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, nfs_pageio_complete(desc); if (!list_empty(&failed)) { list_move(&failed, &hdr->pages); - return -EIO; + return desc->pg_error < 0 ? desc->pg_error : -EIO; } return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bec0384499f7..a3592cc34a20 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync); /* Return the registered pnfs layout driver module matching given id */ @@ -385,13 +385,13 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, enum pnfs_iomode iomode; bool send; - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -566,10 +566,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; - int invalid = 0, removed = 0; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - invalid++; - removed += mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; } - dprintk("%s:Return %i\n", __func__, invalid - removed); - return invalid - removed; + dprintk("%s:Return %i\n", __func__, remaining); + return remaining; } /* note free_me must contain lsegs from a single layout_hdr */ @@ -618,7 +618,6 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_get_layout_hdr(lo); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); - pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -703,6 +702,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, ret = -EAGAIN; spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); iput(inode); } @@ -826,7 +827,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; @@ -861,7 +862,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -894,7 +895,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.minlength = i_size - range->offset; } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; + pnfs_copy_range(&lgp->args.range, range); lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); @@ -904,17 +905,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, lseg = nfs4_proc_layoutget(lgp, gfp_flags); } while (lseg == ERR_PTR(-EAGAIN)); - if (IS_ERR(lseg)) { - switch (PTR_ERR(lseg)) { - case -ENOMEM: - case -ERESTARTSYS: - break; - default: - /* remember that LAYOUTGET failed and suspend trying */ - pnfs_layout_io_set_failed(lo, range->iomode); - } - return NULL; - } else + if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) + lseg = NULL; + else pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(range->iomode)); @@ -945,7 +938,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; @@ -962,7 +955,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, goto out; } - lrp->args.stateid = stateid; + nfs4_stateid_copy(&lrp->args.stateid, stateid); lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; lrp->args.range.iomode = iomode; @@ -1005,7 +998,7 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - stateid = nfsi->layout->plh_stateid; + nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1033,7 +1026,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: @@ -1096,13 +1089,12 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo); - pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { @@ -1124,7 +1116,7 @@ out_noroc: pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); if (layoutreturn) - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); return roc; } @@ -1149,6 +1141,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; + pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1465,25 +1458,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } -/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) -{ - if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) - return 1; - return nfs_wait_bit_killable(key, mode); -} - static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { - if (!pnfs_should_retry_layoutget(lo)) - return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference */ pnfs_layoutcommit_inode(lo->plh_inode, false); return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, - pnfs_layoutget_retry_bit_wait, + nfs_wait_bit_killable, TASK_UNINTERRUPTIBLE); } @@ -1520,14 +1503,23 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; bool first; - if (!pnfs_enabled_sb(NFS_SERVER(ino))) + if (!pnfs_enabled_sb(NFS_SERVER(ino))) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; + } - if (iomode == IOMODE_READ && i_size_read(ino) == 0) + if (iomode == IOMODE_READ && i_size_read(ino) == 0) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; + } - if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; + } lookup_again: first = false; @@ -1535,19 +1527,25 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode) && - !pnfs_should_retry_layoutget(lo)) + if (pnfs_layout_io_test_failed(lo, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; + } first = list_empty(&lo->plh_segs); if (first) { @@ -1567,8 +1565,11 @@ lookup_again: * already exists */ lseg = pnfs_find_lseg(lo, &arg); - if (lseg) + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; + } } /* @@ -1585,11 +1586,16 @@ lookup_again: dprintk("%s retrying\n", __func__); goto lookup_again; } + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo)) + if (pnfs_layoutgets_blocked(lo)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; + } atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1612,8 +1618,9 @@ lookup_again: arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1623,7 +1630,7 @@ out: "(%s, offset: %llu, length: %llu)\n", __func__, ino->i_sb->s_id, (unsigned long long)NFS_FILEID(ino), - lseg == NULL ? "not found" : "found", + IS_ERR_OR_NULL(lseg) ? "not found" : "found", iomode==IOMODE_RW ? "read/write" : "read-only", (unsigned long long)pos, (unsigned long long)count); @@ -1730,16 +1737,29 @@ out_forget_reply: } static void +pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +{ + if (lo->plh_return_iomode == iomode) + return; + if (lo->plh_return_iomode != 0) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; +} + +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) - return; + return 0; + + assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(&lseg->pls_range, return_range)) { @@ -1749,38 +1769,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.offset, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - mark_lseg_invalid(lseg, tmp_list); + pnfs_set_plh_return_iomode(lo, return_range->iomode); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); } + return remaining; } void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg) { struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); struct pnfs_layout_range range = { .iomode = lseg->pls_range.iomode, .offset = 0, .length = NFS4_MAX_UINT64, }; LIST_HEAD(free_me); + bool return_now = false; spin_lock(&inode->i_lock); - /* set failure bit so that pnfs path will be retried later */ - pnfs_layout_set_fail_bit(lo, iomode); - if (lo->plh_return_iomode == 0) - lo->plh_return_iomode = range.iomode; - else if (lo->plh_return_iomode != range.iomode) - lo->plh_return_iomode = IOMODE_ANY; + pnfs_set_plh_return_iomode(lo, range.iomode); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - pnfs_mark_matching_lsegs_return(lo, &free_me, &range); - spin_unlock(&inode->i_lock); + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode = lo->plh_return_iomode; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + return_now = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (return_now) + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } else { + spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); + } pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -1802,6 +1831,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r rd_size, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) @@ -1814,13 +1848,19 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - if (pgio->pg_lseg == NULL) + if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), wb_size, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); @@ -1988,15 +2028,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); @@ -2119,15 +2157,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d1990e90e7a0..9f4e2a47f4aa 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -98,7 +98,6 @@ enum { NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ - NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ }; enum layoutdriver_policy_flags { @@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + const struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d) return d; } -static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); -} - -static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { - atomic_dec(&lo->plh_refcount); - /* wake up waiters for LAYOUTRETURN as that is not needed */ - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - } -} - -static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) return lseg; } +static inline bool +pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg) +{ + return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } +/** + * pnfs_mark_layout_returned_if_empty - marks the layout as returned + * @lo: layout header + * + * Note: Caller must hold inode->i_lock + */ +static inline void +pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) +{ + if (list_empty(&lo->plh_segs)) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); +} + +static inline void +pnfs_copy_range(struct pnfs_layout_range *dst, + const struct pnfs_layout_range *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 24655b807d44..81ac6480f9e7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } else { nfs_retry_commit(mds_pages, NULL, cinfo, 0); pnfs_generic_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); + if (nreq == 0) goto out; - } atomic_add(nreq, &cinfo->mds->rpcs_out); @@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { + if (!pnfs_is_valid_lseg(lseg)) { + spin_unlock(cinfo->lock); + cinfo->completion_ops->resched_write(cinfo, req); + return; + } /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0a5e33f33b5c..eb31e23e7def 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } + nfs_release_request(req); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - nfs_pageio_add_request(&pgio, new); + if (!nfs_pageio_add_request(&pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); + } nfs_pageio_complete(&pgio); /* It doesn't make sense to do mirrored reads! */ @@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, pgm = &pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - return 0; -} - -static void nfs_readpage_release(struct nfs_page *req) -{ - struct inode *inode = d_inode(req->wb_context->dentry); - - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); - - unlock_page(req->wb_page); - } - nfs_release_request(req); + return pgio.pg_error < 0 ? pgio.pg_error : 0; } static void nfs_page_group_set_uptodate(struct nfs_page *req) @@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); error = desc->pgio->pg_error; goto out_unlock; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406930..ce43cd6d88c6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -21,6 +21,8 @@ #include <linux/nfs_page.h> #include <linux/backing-dev.h> #include <linux/export.h> +#include <linux/freezer.h> +#include <linux/wait.h> #include <asm/uaccess.h> @@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc) { int ret = 0; if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_STABLE; + return FLUSH_HIGHPRI | FLUSH_COND_STABLE; if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; - if (wbc->for_kupdate || wbc->for_background) - ret |= FLUSH_LOWPRI; return ret; } @@ -545,12 +545,22 @@ try_again: return head; } +static void nfs_write_error_remove_page(struct nfs_page *req) +{ + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); + generic_error_remove_page(page_file_mapping(req->wb_page), + req->wb_page); +} + /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page, bool nonblock, + bool launder) { struct nfs_page *req; int ret = 0; @@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = 0; if (!nfs_pageio_add_request(pgio, req)) { - nfs_redirty_request(req); ret = pgio->pg_error; + /* + * Remove the problematic req upon fatal errors + * in launder case, while other dirty pages can + * still be around until they get flushed. + */ + if (nfs_error_is_fatal(ret)) { + nfs_context_set_write_error(req->wb_context, ret); + if (launder) { + nfs_write_error_remove_page(req); + goto out; + } + } + nfs_redirty_request(req); + ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -576,12 +599,14 @@ out: return ret; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio, bool launder) { int ret; nfs_pageio_cond_complete(pgio, page_file_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, + launder); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st /* * Write an mmapped page to the server. */ -static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +static int nfs_writepage_locked(struct page *page, + struct writeback_control *wbc, + bool launder) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(page, wbc, false); unlock_page(page); return ret; } @@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(page, wbc, data, false); unlock_page(page); return ret; } @@ -1128,7 +1155,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || + !nfs_match_open_context(req->wb_context, ctx); /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; if (l_ctx && flctx && @@ -1326,9 +1354,15 @@ static void nfs_async_write_error(struct list_head *head) } } +static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + nfs_async_write_error(&hdr->pages); +} + static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .error_cleanup = nfs_async_write_error, .completion = nfs_write_completion, + .reschedule_io = nfs_async_write_reschedule_io, }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -1529,27 +1563,21 @@ static void nfs_writeback_result(struct rpc_task *task, } } - -static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - int ret; + return wait_on_atomic_t(&cinfo->rpcs_out, + nfs_wait_atomic_killable, TASK_KILLABLE); +} - if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) - return 1; - if (!may_wait) - return 0; - ret = out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - return (ret < 0) ? ret : 1; +static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +{ + atomic_inc(&cinfo->rpcs_out); } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { - clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_atomic(); - wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); + if (atomic_dec_and_test(&cinfo->rpcs_out)) + wake_up_atomic_t(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1666,6 +1694,13 @@ void nfs_retry_commit(struct list_head *page_list, } EXPORT_SYMBOL_GPL(nfs_retry_commit); +static void +nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + /* * Commit dirty pages */ @@ -1687,7 +1722,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); out_bad: nfs_retry_commit(head, NULL, cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1749,8 +1783,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commit_end(cinfo.mds); } static void nfs_commit_release(void *calldata) @@ -1769,7 +1802,7 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, - .error_cleanup = nfs_commit_clear_lock, + .resched_write = nfs_commit_resched_write, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1788,30 +1821,25 @@ int nfs_commit_inode(struct inode *inode, int how) LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; + int error = 0; int res; - res = nfs_commit_set_lock(NFS_I(inode), may_wait); - if (res <= 0) - goto out_mark_dirty; nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_commit_begin(cinfo.mds); res = nfs_scan_commit(inode, &head, &cinfo); - if (res) { - int error; - + if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); - if (error < 0) - return error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_bit_action(&NFS_I(inode)->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - if (error < 0) - return error; - } else - nfs_commit_clear_lock(NFS_I(inode)); + nfs_commit_end(cinfo.mds); + if (error < 0) + goto out_error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_commit(cinfo.mds); + if (error < 0) + return error; return res; +out_error: + res = error; /* Note: If we exit without ensuring that the commit is complete, * we must mark the inode as dirty. Otherwise, future calls to * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure @@ -1821,6 +1849,7 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } +EXPORT_SYMBOL_GPL(nfs_commit_inode); int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -1911,7 +1940,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); @@ -1928,7 +1957,7 @@ int nfs_wb_page(struct inode *inode, struct page *page) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + ret = nfs_writepage_locked(page, &wbc, launder); if (ret < 0) goto out_error; continue; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7343844e6b6..7f5d3d9f1c37 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1416,7 +1416,8 @@ static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e785fd954c30..741077deef3b 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) break; } spin_unlock(&next_i->i_lock); - next_i = list_entry(next_i->i_sb_list.next, - struct inode, i_sb_list); + next_i = list_next_entry(next_i, i_sb_list); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index fc0df4442f7b..cfcbf114676e 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -92,9 +92,6 @@ #include "fsnotify.h" struct srcu_struct fsnotify_mark_srcu; -static DEFINE_SPINLOCK(destroy_lock); -static LIST_HEAD(destroy_list); -static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -168,10 +165,19 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) atomic_dec(&group->num_marks); } +static void +fsnotify_mark_free_rcu(struct rcu_head *rcu) +{ + struct fsnotify_mark *mark; + + mark = container_of(rcu, struct fsnotify_mark, g_rcu); + fsnotify_put_mark(mark); +} + /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Free fsnotify mark. The freeing is actually happening from a call_srcu + * callback. Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ void fsnotify_free_mark(struct fsnotify_mark *mark) { @@ -186,10 +192,7 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); /* * Some groups like to know that marks are being freed. This is a @@ -385,11 +388,7 @@ err: spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); - + call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu); return ret; } @@ -492,40 +491,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } - -static int fsnotify_mark_destroy(void *ignored) -{ - struct fsnotify_mark *mark, *next; - struct list_head private_destroy_list; - - for (;;) { - spin_lock(&destroy_lock); - /* exchange the list head */ - list_replace_init(&destroy_list, &private_destroy_list); - spin_unlock(&destroy_lock); - - synchronize_srcu(&fsnotify_mark_srcu); - - list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { - list_del_init(&mark->g_list); - fsnotify_put_mark(mark); - } - - wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); - } - - return 0; -} - -static int __init fsnotify_mark_init(void) -{ - struct task_struct *thread; - - thread = kthread_run(fsnotify_mark_destroy, NULL, - "fsnotify_mark"); - if (IS_ERR(thread)) - panic("unable to start fsnotify mark destruction thread."); - - return 0; -} -device_initcall(fsnotify_mark_init); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585b53..2f77f8dfb861 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d6526dc..a3ded88718c9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db162..f3dc1b0dfffc 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 709fbbd44c65..a3cc6d2fc896 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1780,8 +1780,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c83ff..68c607e63ff6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -376,17 +376,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 84f2f8079466..9477d6e1de37 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } @@ -2519,6 +2519,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->master_lock); ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, target, dlm->node_num); + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + */ + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2544,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -2554,6 +2559,7 @@ fail: if (mle_added) { dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); + dlm_put_mle_inuse(mle); } else if (mle) { kmem_cache_free(dlm_mle_cache, mle); mle = NULL; @@ -2571,17 +2577,6 @@ fail: * ensure that all assert_master work is flushed. */ flush_workqueue(dlm->dlm_worker); - /* get an extra reference on the mle. - * otherwise the assert_master from the new - * master will destroy this. - * also, make sure that all callers of dlm_get_mle - * take both dlm->spinlock and dlm->master_lock */ - spin_lock(&dlm->spinlock); - spin_lock(&dlm->master_lock); - dlm_get_mle_inuse(mle); - spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); - /* notify new node and send all lock state */ /* call send_one_lockres with migration flag. * this serves as notice to the target node that a @@ -3050,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3141,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " @@ -3312,6 +3308,15 @@ top: mle->new_master != dead_node) continue; + if (mle->new_master == dead_node && mle->inuse) { + mlog(ML_NOTICE, "%s: target %u died during " + "migration from %u, the MLE is " + "still keep used, ignore it!\n", + dlm->name, dead_node, + mle->master); + continue; + } + /* If we have reached this point, this mle needs to be * removed from the list and freed. */ dlm_clean_migration_mle(dlm, mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9e4f862d20fe..c5bdf02c213b 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; @@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ @@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -2450,11 +2457,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 2e3c9dbab68c..1082b2c3014b 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb18a..03768bb3aab1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 20276e340339..f92612e4b9d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2432,12 +2432,6 @@ bail: * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, @@ -2449,8 +2443,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) - ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f92e..d63127932509 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt, } generic_fillattr(inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccce60..16b0bb482ea7 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -606,9 +606,7 @@ bail: if (gb_inode) mutex_unlock(&gb_inode->i_mutex); - if (gb_inode) - iput(gb_inode); - + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe5b5..3772a2dbb980 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0711..e9c99e35f5ea 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -473,8 +472,7 @@ out_mutex: iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index afb81eae2c18..ab42c38031b1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1683,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -2373,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, name, strlen(name)); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); @@ -2388,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index b6d51333ad02..d153e6e31529 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -82,7 +82,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c93d67220887..fde9ef18cff3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d44c8..1e09592148ad 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a9340c..faa1365097bc 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb, int status, user_stack = 0; char *p; u32 tmp; + int token, option; + substring_t args[MAX_OPT_ARGS]; trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); @@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb, } while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - if (!*p) continue; @@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = option; break; case Opt_slot: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = (s16)option; break; case Opt_commit: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->commit_interval = HZ * option; break; case Opt_localalloc: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cda3e..b61b883c8ff8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d0e9b9b6223e..42305ddcbaa0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 9155a5a0d3b9..df4661abadc4 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) /* * Estimate the amount of memory available for userspace allocations, * without causing swapping. - * - * Free memory cannot be taken below the low watermark, before the - * system starts swapping. */ - available = i.freeram - wmark_low; + available = i.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187b3b5f242e..a353b4c6e86e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -22,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any @@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } @@ -451,6 +462,7 @@ struct mem_size_stats { unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -485,6 +497,19 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } } +#ifdef CONFIG_SHMEM +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); + + return 0; +} +#endif + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -512,6 +537,19 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) @@ -671,6 +709,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); @@ -817,9 +880,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } #else diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index f37b3deb01b4..3a67cfb142d8 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -365,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 9728b5499e1d..47bb1de07155 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -625,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a6d9..fbd70af98820 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2924,4 +2924,4 @@ static int __init dquot_init(void) return 0; } -module_init(dquot_init); +fs_initcall(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index bb2869f5dfd8..d07a2f91d858 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -1,7 +1,5 @@ - #include <linux/cred.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> @@ -105,5 +103,4 @@ static int __init quota_init(void) "VFS: Failed to create quota netlink interface.\n"); return 0; }; - -module_init(quota_init); +fs_initcall(quota_init); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 2aa012a68e90..ed85d4f35c04 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); static int v2r1_is_id(void *dp, struct dquot *dquot); -static struct qtree_fmt_operations v2r0_qtree_ops = { +static const struct qtree_fmt_operations v2r0_qtree_ops = { .mem2disk_dqblk = v2r0_mem2diskdqb, .disk2mem_dqblk = v2r0_disk2memdqb, .is_id = v2r0_is_id, }; -static struct qtree_fmt_operations v2r1_qtree_ops = { +static const struct qtree_fmt_operations v2r1_qtree_ops = { .mem2disk_dqblk = v2r1_mem2diskdqb, .disk2mem_dqblk = v2r1_disk2memdqb, .is_id = v2r1_is_id, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc3bf..05db7473bcb5 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb894e78a821..6b00ca357c58 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -619,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index dded920cbc8f..5e79bfa4f260 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -419,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/super.c b/fs/super.c index cc658a20a29e..1182af8fd5ff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1197,7 +1197,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) else ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - WARN_ON(force_trylock & !ret); + WARN_ON(force_trylock && !ret); return ret; } EXPORT_SYMBOL(__sb_start_write); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 07ac18c355e7..d62c423a5a2d 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -346,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079537..a233ba913be4 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2248,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6d6a96b4e73f..e0fd65fe73e8 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; - struct allocExtDesc *aed; eloc.logicalBlockNum = start; elen = EXT_RECORDED_ALLOCATED | @@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.offset + (2 * adsize) > sb->s_blocksize) { - unsigned char *sptr, *dptr; - int loffset; - - brelse(oepos.bh); - oepos = epos; - /* Steal a block from the extent being free'd */ - epos.block.logicalBlockNum = eloc.logicalBlockNum; + udf_setup_indirect_aext(table, eloc.logicalBlockNum, + &epos); + eloc.logicalBlockNum++; elen -= sb->s_blocksize; - - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &epos.block, 0)); - if (!epos.bh) { - brelse(oepos.bh); - goto error_return; - } - aed = (struct allocExtDesc *)(epos.bh->b_data); - aed->previousAllocExtLocation = - cpu_to_le32(oepos.block.logicalBlockNum); - if (epos.offset + adsize > sb->s_blocksize) { - loffset = epos.offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = iinfo->i_ext.i_data + epos.offset - - adsize; - dptr = epos.bh->b_data + - sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos.offset = sizeof(struct allocExtDesc) + - adsize; - } else { - loffset = epos.offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - if (oepos.bh) { - sptr = oepos.bh->b_data + epos.offset; - aed = (struct allocExtDesc *) - oepos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, - adsize); - } else { - sptr = iinfo->i_ext.i_data + - epos.offset; - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } - epos.offset = sizeof(struct allocExtDesc); - } - if (sbi->s_udfrev >= 0x0200) - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 3, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - else - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 2, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos.block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - lad->extLocation = - cpu_to_lelb(epos.block); - break; - } - if (oepos.bh) { - udf_update_tag(oepos.bh->b_data, loffset); - mark_buffer_dirty(oepos.bh); - } else { - mark_inode_dirty(table); - } } /* It's possible that stealing the block emptied the extent */ - if (elen) { - udf_write_aext(table, &epos, &eloc, elen, 1); - - if (!epos.bh) { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } else { - aed = (struct allocExtDesc *)epos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - udf_update_tag(epos.bh->b_data, epos.offset); - mark_buffer_dirty(epos.bh); - } - } + if (elen) + __udf_add_aext(table, &epos, &eloc, elen, 1); } brelse(epos.bh); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 055746350d16..87dc16d15572 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode, udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; - } else + } else { + struct kernel_lb_addr tmploc; + uint32_t tmplen; + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); + /* + * We've rewritten the last extent but there may be empty + * indirect extent after it - enter it. + */ + udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); + } /* Managed to do everything necessary? */ if (!blocks) @@ -1867,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, return inode; } -int udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos) { - int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; struct allocExtDesc *aed; - uint8_t *ptr; - struct udf_inode_info *iinfo = UDF_I(inode); + struct extent_position nepos; + struct kernel_lb_addr neloc; + int ver, adsize; - if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - - udf_file_entry_alloc_offset(inode) + - iinfo->i_lenEAttr; + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); else - ptr = epos->bh->b_data + epos->offset; + return -EIO; + + neloc.logicalBlockNum = block; + neloc.partitionReferenceNum = epos->block.partitionReferenceNum; + + bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + if (!bh) + return -EIO; + lock_buffer(bh); + memset(bh->b_data, 0x00, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + + aed = (struct allocExtDesc *)(bh->b_data); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { + aed->previousAllocExtLocation = + cpu_to_le32(epos->block.logicalBlockNum); + } + aed->lengthAllocDescs = cpu_to_le32(0); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + ver = 3; + else + ver = 2; + udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, + sizeof(struct tag)); + + nepos.block = neloc; + nepos.offset = sizeof(struct allocExtDesc); + nepos.bh = bh; + + /* + * Do we have to copy current last extent to make space for indirect + * one? + */ + if (epos->offset + adsize > sb->s_blocksize) { + struct kernel_lb_addr cp_loc; + uint32_t cp_len; + int cp_type; + + epos->offset -= adsize; + cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0); + cp_len |= ((uint32_t)cp_type) << 30; + + __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); + udf_write_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } else { + __udf_add_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } + + brelse(epos->bh); + *epos = nepos; + + return 0; +} + +/* + * Append extent at the given position - should be the first free one in inode + * / indirect extent. This function assumes there is enough space in the inode + * or indirect extent. Use udf_add_aext() if you didn't check for this before. + */ +int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + struct allocExtDesc *aed; + int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -1891,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, else return -EIO; - if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { - unsigned char *sptr, *dptr; - struct buffer_head *nbh; - int err, loffset; - struct kernel_lb_addr obloc = epos->block; - - epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, - obloc.partitionReferenceNum, - obloc.logicalBlockNum, &err); - if (!epos->block.logicalBlockNum) - return -ENOSPC; - nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - &epos->block, - 0)); - if (!nbh) - return -EIO; - lock_buffer(nbh); - memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(nbh); - unlock_buffer(nbh); - mark_buffer_dirty_inode(nbh, inode); - - aed = (struct allocExtDesc *)(nbh->b_data); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - aed->previousAllocExtLocation = - cpu_to_le32(obloc.logicalBlockNum); - if (epos->offset + adsize > inode->i_sb->s_blocksize) { - loffset = epos->offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = ptr - adsize; - dptr = nbh->b_data + sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos->offset = sizeof(struct allocExtDesc) + adsize; - } else { - loffset = epos->offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - sptr = ptr; - epos->offset = sizeof(struct allocExtDesc); - - if (epos->bh) { - aed = (struct allocExtDesc *)epos->bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - } else { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(inode); - } - } - if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - else - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos->block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - lad->extLocation = cpu_to_lelb(epos->block); - memset(lad->impUse, 0x00, sizeof(lad->impUse)); - break; - } - if (epos->bh) { - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos->bh->b_data, loffset); - else - udf_update_tag(epos->bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos->bh, inode); - brelse(epos->bh); - } else { - mark_inode_dirty(inode); - } - epos->bh = nbh; + if (!epos->bh) { + WARN_ON(iinfo->i_lenAlloc != + epos->offset - udf_file_entry_alloc_offset(inode)); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != + epos->offset - sizeof(struct allocExtDesc)); + WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); @@ -1996,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, return 0; } +/* + * Append extent at given position - should be the first free one in inode + * / indirect extent. Takes care of allocating and linking indirect blocks. + */ +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct super_block *sb = inode->i_sb; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > sb->s_blocksize) { + int err; + int new_block; + + new_block = udf_new_block(sb, NULL, + epos->block.partitionReferenceNum, + epos->block.logicalBlockNum, &err); + if (!new_block) + return -ENOSPC; + + err = udf_setup_indirect_aext(inode, new_block, epos); + if (err) + return err; + } + + return __udf_add_aext(inode, epos, eloc, elen, inc); +} + void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { @@ -2048,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos, epos->offset += adsize; } +/* + * Only 1 indirect extent in a row really makes sense but allow upto 16 in case + * someone does some weird stuff. + */ +#define UDF_MAX_INDIR_EXTS 16 + int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; + unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { int block; + + if (++indirections > UDF_MAX_INDIR_EXTS) { + udf_err(inode->i_sb, + "too many indirect extents in inode %lu\n", + inode->i_ino); + return -1; + } + epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b445b..0fbb4c7c72e8 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; @@ -1586,6 +1587,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* + * Maximum number of Terminating Descriptor redirections. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_TD_NESTING 64 + +/* * Process a main/reserve volume descriptor sequence. * @block First block of first extent of the sequence. * @lastblock Lastblock of first extent of the sequence. @@ -1609,6 +1617,7 @@ static noinline int udf_process_sequence( uint16_t ident; long next_s = 0, next_e = 0; int ret; + unsigned int indirections = 0; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1679,6 +1688,12 @@ static noinline int udf_process_sequence( } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; + } + vds[VDS_POS_TERMINATING_DESC].block = block; if (next_e) { block = next_s; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ce169b49429d..fa0044b6b81d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -158,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos); +extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc); extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index ab478e62baae..e788a05aab83 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -128,11 +128,15 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) if (c < 0x80U) utf_o->u_name[utf_o->u_len++] = (uint8_t)c; else if (c < 0x800U) { + if (utf_o->u_len > (UDF_NAME_LEN - 4)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); } else { + if (utf_o->u_len > (UDF_NAME_LEN - 5)) + break; utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); utf_o->u_name[utf_o->u_len++] = @@ -173,17 +177,22 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) { unsigned c, i, max_val, utf_char; - int utf_cnt, u_len; + int utf_cnt, u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; utf_char = 0U; utf_cnt = 0U; for (i = 0U; i < utf->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; + c = (uint8_t)utf->u_name[i]; /* Complete a multi-byte UTF-8 character */ @@ -225,6 +234,7 @@ try_again: if (max_val == 0xffU) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } goto error_out; @@ -277,7 +287,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, c = (c << 8) | ocu[i++]; len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len], - UDF_NAME_LEN - utf_o->u_len); + UDF_NAME_LEN - 2 - utf_o->u_len); /* Valid character? */ if (len >= 0) utf_o->u_len += len; @@ -295,15 +305,19 @@ static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int len; unsigned i, max_val; uint16_t uni_char; - int u_len; + int u_len, u_ch; memset(ocu, 0, sizeof(dstring) * length); ocu[0] = 8; max_val = 0xffU; + u_ch = 1; try_again: u_len = 0U; for (i = 0U; i < uni->u_len; i++) { + /* Name didn't fit? */ + if (u_len + 1 + u_ch >= length) + return 0; len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char); if (!len) continue; @@ -316,6 +330,7 @@ try_again: if (uni_char > max_val) { max_val = 0xffffU; ocu[0] = (uint8_t)0x10U; + u_ch = 2; goto try_again; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec02ca..442fd52ebffe 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..d1c66e465ca5 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3479294c1d58..a708e38b494c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -535,6 +535,7 @@ xfs_agfl_write_verify( } const struct xfs_buf_ops xfs_agfl_buf_ops = { + .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, }; @@ -1926,7 +1927,7 @@ xfs_alloc_space_available( * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ -STATIC int /* error */ +int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ int flags) /* XFS_ALLOC_FLAG_... */ @@ -2339,6 +2340,7 @@ xfs_agf_write_verify( } const struct xfs_buf_ops xfs_agf_buf_ops = { + .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0ecde4d5cac8..135eb3d24db7 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -235,5 +235,6 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 90de071dd4c2..444626ddbd1b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -293,14 +293,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): @@ -311,14 +304,7 @@ xfs_allocbt_verify( return false; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): @@ -332,21 +318,7 @@ xfs_allocbt_verify( return false; } - /* numrecs verification */ - if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - - return true; + return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -379,6 +351,7 @@ xfs_allocbt_write_verify( } const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f949818fa1c7..fa3b948ef9c2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -207,7 +207,7 @@ xfs_attr_set( struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, committed, local; + int error, err2, local; XFS_STATS_INC(mp, xs_attr_set); @@ -334,25 +334,15 @@ xfs_attr_set( */ xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) { - error = xfs_bmap_finish(&args.trans, args.flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args.trans, args.flist, dp); if (error) { - ASSERT(committed); args.trans = NULL; xfs_bmap_cancel(&flist); goto out; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args.trans, dp, 0); - - /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ @@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_leaf_addname(args); @@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the current trans (including the inode) and start * a new one. */ @@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int error, committed, forkoff; + int error, forkoff; trace_xfs_attr_leaf_removename(args); @@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } return 0; } @@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; xfs_mount_t *mp; - int committed, retval, error; + int retval, error; trace_xfs_attr_node_addname(args); @@ -938,27 +897,16 @@ restart: state = NULL; xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the node conversion and start the next * trans in the chain. */ @@ -977,23 +925,13 @@ restart: */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_split(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1086,25 +1024,14 @@ restart: if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_node_removename(args); @@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - /* * Commit the Btree join operation and start a new trans. */ @@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else xfs_trans_brelse(args->trans, bp); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index aa187f7ba2dd..01a5ecfedfcf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify( } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5ab95ffa4ae9..a572532a55cd 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify( } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, }; @@ -447,8 +448,6 @@ xfs_attr_rmtval_set( * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { - int committed; - /* * Allocate a single extent, up to the size of the value. * @@ -466,24 +465,14 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -614,31 +603,20 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - int committed; - xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->flist, &done); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + args->dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* * Close out trans and start the next one in the chain. */ error = xfs_trans_roll(&args->trans, args->dp); diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59646..0a94cce5ea35 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -32,13 +32,13 @@ int xfs_bitmap_empty(uint *map, uint size) { uint i; - uint ret = 0; for (i = 0; i < size; i++) { - ret |= map[i]; + if (map[i] != 0) + return 0; } - return (ret == 0); + return 1; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 119c2422aac7..ef00156f4f96 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -325,9 +325,11 @@ xfs_check_block( /* * Check that the extents for the inode ip are in the right order in all - * btree leaves. + * btree leaves. THis becomes prohibitively expensive for large extent count + * files, so don't bother with inodes that have more than 10,000 extents in + * them. The btree record ordering checks will still be done, so for such large + * bmapbt constructs that is going to catch most corruptions. */ - STATIC void xfs_bmap_check_leaf_extents( xfs_btree_cur_t *cur, /* btree cursor or null */ @@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents( return; } + /* skip large extent count inodes */ + if (ip->i_d.di_nextents > 10000) + return; + bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -1111,7 +1117,6 @@ xfs_bmap_add_attrfork( xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1214,7 +1219,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1723,10 +1728,11 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; - ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + mp = bma->ip->i_mount; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1785,7 +1791,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2016,10 +2022,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, - &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2100,10 +2106,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, - &tmp_rval, XFS_DATA_FORK); + &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2169,10 +2175,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - 1, &tmp_rval, XFS_DATA_FORK); + 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2215,13 +2221,13 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - da_old > 0, &tmp_logflags, XFS_DATA_FORK); + da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) goto done; @@ -2242,7 +2248,7 @@ xfs_bmap_add_extent_delay_real( if (bma->cur) bma->cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: bma->logflags |= rval; return error; @@ -2939,7 +2945,7 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; + mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -5950,7 +5956,6 @@ xfs_bmap_split_extent( struct xfs_trans *tp; struct xfs_bmap_free free_list; xfs_fsblock_t firstfsb; - int committed; int error; tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); @@ -5971,7 +5976,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index a160f8a5a3fc..423a34e832bd 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -195,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); + struct xfs_inode *ip); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6b0cf6546a82..1637c37bfbaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -720,6 +720,7 @@ xfs_bmbt_write_verify( } const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index af1bbee5586e..a0eb18ce3ad3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4080,3 +4080,61 @@ xfs_btree_change_owner( return 0; } + +/** + * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * btree block + * + * @bp: buffer containing the btree block + * @max_recs: pointer to the m_*_mxr max records field in the xfs mount + * @pag_max_level: pointer to the per-ag max level field + */ +bool +xfs_btree_sblock_v5hdr_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + return true; +} + +/** + * xfs_btree_sblock_verify() -- verify a short-format btree block + * + * @bp: buffer containing the btree block + * @max_recs: maximum records allowed in this btree node + */ +bool +xfs_btree_sblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 992dec0638f3..2e874be70209 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -472,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) +bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e89a0f8f827c..097bf7717d80 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -245,6 +245,7 @@ xfs_da3_node_read_verify( } const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9c10e2b8cfcb..aa17cb788946 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -123,6 +123,7 @@ xfs_dir3_block_write_verify( } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index af71a84f343c..725fc7841fde 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -305,11 +305,13 @@ xfs_dir3_data_write_verify( } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .name = "xfs_dir3_data_reada", .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 3923e1f94697..b887fb2a2bcf 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify( } const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 70b0cb2fd556..63ee03db796c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -150,6 +150,7 @@ xfs_dir3_free_write_verify( } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5331b7f0460c..3cc3cf767474 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -54,7 +54,7 @@ xfs_dqcheck( xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ uint flags, - char *str) + const char *str) { xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; int errs = 0; @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc( STATIC bool xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + int warn) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; xfs_dqid_t id = 0; @@ -240,8 +241,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); if (error) return false; } @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify( if (!xfs_dquot_buf_verify_crc(mp, bp)) xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp)) + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify( } /* + * readahead errors are silent and simply leave the buffer as !done so a real + * read will then be run with the xfs_dquot_buf_ops verifier. See + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than + * reporting the failure. + */ +static void +xfs_dquot_buf_readahead_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || + !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + } +} + +/* * we don't calculate the CRC here as that is done when the dquot is flushed to * the buffer after the update is done. This ensures that the dquot in the * buffer always has an up-to-date CRC value. @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify(mp, bp)) { + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify( } const struct xfs_buf_ops xfs_dquot_buf_ops = { + .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, }; +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { + .name = "xfs_dquot_ra", + .verify_read = xfs_dquot_buf_readahead_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8774498ce0ff..e2536bb1c760 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -786,7 +786,7 @@ typedef struct xfs_agfl { __be64 agfl_lsn; __be32 agfl_crc; __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; +} __attribute__((packed)) xfs_agfl_t; #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 70c1db99f6a7..66d702e6b9ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2572,6 +2572,7 @@ xfs_agi_write_verify( } const struct xfs_buf_ops xfs_agi_buf_ops = { + .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index f39b285beb19..c679f3c05b63 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -221,7 +221,6 @@ xfs_inobt_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_perag *pag = bp->b_pag; unsigned int level; /* @@ -237,14 +236,7 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): @@ -254,24 +246,12 @@ xfs_inobt_verify( return 0; } - /* numrecs and level verification */ + /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - return true; + return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } static void @@ -304,6 +284,7 @@ xfs_inobt_write_verify( } const struct xfs_buf_ops xfs_inobt_buf_ops = { + .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 268c00f4f83a..1aabfda669b0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -62,11 +62,14 @@ xfs_inobp_check( * has not had the inode cores stamped into it. Hence for readahead, the buffer * may be potentially invalid. * - * If the readahead buffer is invalid, we don't want to mark it with an error, - * but we do want to clear the DONE status of the buffer so that a followup read - * will re-read it from disk. This will ensure that we don't get an unnecessary - * warnings during log recovery and we don't get unnecssary panics on debug - * kernels. + * If the readahead buffer is invalid, we need to mark it with an error and + * clear the DONE status of the buffer so that a followup read will re-read it + * from disk. We don't report the error otherwise to avoid warnings during log + * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * because all we want to do is say readahead failed; there is no-one to report + * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( @@ -93,6 +96,7 @@ xfs_inode_buf_verify( XFS_RANDOM_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; + xfs_buf_ioerror(bp, -EIO); return; } @@ -132,11 +136,13 @@ xfs_inode_buf_write_verify( } const struct xfs_buf_ops xfs_inode_buf_ops = { + .name = "xfs_inode", .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .name = "xxfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 1c55ccbb379d..8e385f91d660 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -60,6 +60,7 @@ typedef struct xlog_recover { */ #define XLOG_BC_TABLE_SIZE 64 +#define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a08379759..f51078f1e92a 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, char *str); + xfs_dqid_t id, uint type, uint flags, const char *str); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a0b071d881a0..8a53eaa349f4 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -679,11 +679,13 @@ xfs_sb_write_verify( } const struct xfs_buf_ops xfs_sb_buf_ops = { + .name = "xfs_sb", .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .name = "xfs_sb_quiet", .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 5be529707903..15c3ceb845b9 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index cb6fd20a4d3d..2e2c6716b623 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -168,6 +168,7 @@ xfs_symlink_write_verify( } const struct xfs_buf_ops xfs_symlink_buf_ops = { + .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, }; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29e7e5dd5178..379c089fb051 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1917,6 +1917,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { + trace_xfs_vm_readpage(page->mapping->host, 1); return mpage_readpage(page, xfs_get_blocks); } @@ -1927,6 +1928,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { + trace_xfs_vm_readpages(mapping->host, nr_pages); return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index dbae6490a79a..45ec9e40150c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -91,32 +91,32 @@ xfs_zero_extent( * last due to locking considerations. We never free any extents in * the first transaction. * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. + * If an inode *ip is provided, rejoin it to the transaction if + * the transaction was committed. */ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ struct xfs_bmap_free *flist, /* i/o: list extents to free */ - int *committed)/* xact committed or not */ + struct xfs_inode *ip) { struct xfs_efd_log_item *efd; /* extent free data */ struct xfs_efi_log_item *efi; /* extent free intention */ int error; /* error return value */ + int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; + if (flist->xbf_count == 0) return 0; - } + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = __xfs_trans_roll(tp, NULL, committed); + error = __xfs_trans_roll(tp, ip, &committed); if (error) { /* * If the transaction was committed, drop the EFD reference @@ -128,16 +128,13 @@ xfs_bmap_finish( * transaction so we should return committed=1 even though we're * returning an error. */ - if (*committed) { + if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, (error == -EFSCORRUPTED) ? SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); - } else { - *committed = 1; } - return error; } @@ -969,7 +966,6 @@ xfs_alloc_file_space( xfs_bmbt_irec_t imaps[1], *imapp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; trace_xfs_alloc_file_space(ip); @@ -1064,23 +1060,20 @@ xfs_alloc_file_space( error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, resblks, imapp, &nimaps, &free_list); - if (error) { + if (error) goto error0; - } /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { + if (error) break; - } allocated_fsb = imapp->br_blockcount; @@ -1206,7 +1199,6 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int committed; int done; xfs_fileoff_t endoffset_fsb; int error; @@ -1346,17 +1338,15 @@ xfs_free_file_space( error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, &done); - if (error) { + if (error) goto error0; - } /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1434,7 +1424,6 @@ xfs_shift_file_space( int error; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - int committed; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; @@ -1526,7 +1515,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ace91e7c713e..daed4bfb85b2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -604,6 +604,13 @@ found: } } + /* + * Clear b_error if this is a lookup from a caller that doesn't expect + * valid data to be found in the buffer. + */ + if (!(flags & XBF_READ)) + xfs_buf_ioerror(bp, 0); + XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; @@ -1045,7 +1052,7 @@ xfs_buf_ioend_work( xfs_buf_ioend(bp); } -void +static void xfs_buf_ioend_async( struct xfs_buf *bp) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c79b717d9b88..c75721acd867 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -132,6 +132,7 @@ struct xfs_buf_map { struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { + char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); }; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7ac6c5c586cb..9c44d38dcd1f 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -306,7 +306,7 @@ xfs_qm_dqalloc( xfs_fsblock_t firstblock; xfs_bmap_free_t flist; xfs_bmbt_irec_t map; - int nmaps, error, committed; + int nmaps, error; xfs_buf_t *bp; xfs_trans_t *tp = *tpp; @@ -379,11 +379,12 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + error = xfs_bmap_finish(tpp, &flist, NULL); + if (error) goto error1; - } - if (committed) { + /* Transaction was committed? */ + if (*tpp != tp) { tp = *tpp; xfs_trans_bjoin(tp, bp); } else { @@ -393,9 +394,9 @@ xfs_qm_dqalloc( *O_bpp = bp; return 0; - error1: +error1: xfs_bmap_cancel(&flist); - error0: +error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 74d0e5966ebc..88693a98fac5 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -164,9 +164,9 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_bn); + __return_address, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5392ab2def1..ebe9b8290a70 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -402,19 +402,26 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - /* for dax, we need to avoid the page cache */ - if (IS_DAX(VFS_I(ip))) - ret = default_file_splice_read(infilp, ppos, pipe, count, flags); - else - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); + /* + * DAX inodes cannot ues the page cache for splice, so we have to push + * them through the VFS IO path. This means it goes through + * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we + * cannot lock the splice operation at this level for DAX inodes. + */ + if (IS_DAX(VFS_I(ip))) { + ret = default_file_splice_read(infilp, ppos, pipe, count, + flags); + goto out; + } + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out: + if (ret > 0) + XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); return ret; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ee393996b7d..ae3758a90ed6 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1143,7 +1143,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1226,7 +1225,7 @@ xfs_create( * pointing to itself. */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, &committed); + prid, resblks > 0, &ip, NULL); if (error) goto out_trans_cancel; @@ -1275,7 +1274,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -1427,7 +1426,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; int resblks; trace_xfs_link(tdp, target_name); @@ -1502,11 +1500,10 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } - error = xfs_bmap_finish (&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_bmap_cancel(&free_list); goto error_return; @@ -1555,7 +1552,6 @@ xfs_itruncate_extents( xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; xfs_filblks_t unmap_len; - int committed; int error = 0; int done = 0; @@ -1601,9 +1597,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (committed) - xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto out_bmap_cancel; @@ -1774,7 +1768,6 @@ xfs_inactive_ifree( { xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; @@ -1841,7 +1834,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); @@ -2523,7 +2516,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; uint resblks; trace_xfs_remove(dp, name); @@ -2624,7 +2616,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -2701,7 +2693,6 @@ xfs_finish_rename( struct xfs_trans *tp, struct xfs_bmap_free *free_list) { - int committed = 0; int error; /* @@ -2711,7 +2702,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, &committed); + error = xfs_bmap_finish(&tp, free_list, NULL); if (error) { xfs_bmap_cancel(free_list); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f4f5b43cf647..d81bdc080370 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -129,7 +129,6 @@ xfs_iomap_write_direct( xfs_trans_t *tp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; @@ -203,15 +202,20 @@ xfs_iomap_write_direct( * this outside the transaction context, but if we commit and then crash * we may not have zeroed the blocks and this will be exposed on * recovery of the allocation. Hence we must zero before commit. + * * Further, if we are mapping unwritten extents here, we need to zero * and convert them to written so that we don't need an unwritten extent * callback for DAX. This also means that we need to be able to dip into - * the reserve block pool if there is no space left but we need to do - * unwritten extent conversion. + * the reserve block pool for bmbt block allocation if there is no space + * left but we need to do unwritten extent conversion. */ + if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - tp->t_flags |= XFS_TRANS_RESERVE; + if (ISUNWRITTEN(imap)) { + tp->t_flags |= XFS_TRANS_RESERVE; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + } } error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, resrtextents); @@ -247,7 +251,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -693,7 +697,7 @@ xfs_iomap_write_allocate( xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; xfs_trans_t *tp; - int nimaps, committed; + int nimaps; int error = 0; int nres; @@ -794,7 +798,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto trans_cancel; @@ -852,7 +856,6 @@ xfs_iomap_write_unwritten( xfs_bmap_free_t free_list; xfs_fsize_t i_size; uint resblks; - int committed; int error; trace_xfs_unwritten_convert(ip, offset, count); @@ -924,7 +927,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f52c72a1a06f..9c9a1c9bcc7f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp) int aborted = 0; /* - * Race to shutdown the filesystem if we see an error. + * Race to shutdown the filesystem if we see an error or the iclog is in + * IOABORT state. The IOABORT state is only set in DEBUG mode to inject + * CRC errors into log recovery. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, - XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, + XFS_RANDOM_IODONE_IOERR) || + iclog->ic_state & XLOG_STATE_IOABORT) { + if (iclog->ic_state & XLOG_STATE_IOABORT) + iclog->ic_state &= ~XLOG_STATE_IOABORT; + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1838,6 +1844,23 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); +#ifdef DEBUG + /* + * Intentionally corrupt the log record CRC based on the error injection + * frequency, if defined. This facilitates testing log recovery in the + * event of torn writes. Hence, set the IOABORT state to abort the log + * write on I/O completion and shutdown the fs. The subsequent mount + * detects the bad CRC and attempts to recover. + */ + if (log->l_badcrc_factor && + (prandom_u32() % log->l_badcrc_factor == 0)) { + iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_state |= XLOG_STATE_IOABORT; + xfs_warn(log->l_mp, + "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", + be64_to_cpu(iclog->ic_header.h_lsn)); + } +#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2045,12 +2068,14 @@ xlog_print_tic_res( "QM_DQCLUSTER", "QM_QINOCREATE", "QM_QUOTAOFF_END", - "SB_UNIT", "FSYNC_TS", "GROWFSRT_ALLOC", "GROWFSRT_ZERO", "GROWFSRT_FREE", - "SWAPEXT" + "SWAPEXT", + "CHECKPOINT", + "ICREATE", + "CREATE_TMPFILE" }; xfs_warn(mp, "xlog_write: reservation summary:"); @@ -2791,11 +2816,19 @@ xlog_state_do_callback( } } while (!ioerrors && loopdidcallbacks); +#ifdef DEBUG /* - * make one last gasp attempt to see if iclogs are being left in - * limbo.. + * Make one last gasp attempt to see if iclogs are being left in limbo. + * If the above loop finds an iclog earlier than the current iclog and + * in one of the syncing states, the current iclog is put into + * DO_CALLBACK and the callbacks are deferred to the completion of the + * earlier iclog. Walk the iclogs in order and make sure that no iclog + * is in DO_CALLBACK unless an earlier iclog is in one of the syncing + * states. + * + * Note that SYNCING|IOABORT is a valid state so we cannot just check + * for ic_state == SYNCING. */ -#ifdef DEBUG if (funcdidcallbacks) { first_iclog = iclog = log->l_iclog; do { @@ -2810,7 +2843,7 @@ xlog_state_do_callback( * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state & XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_DONE_SYNC || iclog->ic_state == XLOG_STATE_IOERROR ) break; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8daba7491b13..ed8896310c00 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ @@ -410,6 +411,8 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c5ecaacdd218..da37beb76f6e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -61,6 +61,9 @@ xlog_recover_check_summary( #else #define xlog_recover_check_summary(log) #endif +STATIC int +xlog_do_recovery_pass( + struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* * This structure is used during recovery to record the buf log items which @@ -868,6 +871,351 @@ validate_head: } /* + * Seek backwards in the log for log record headers. + * + * Given a starting log block, walk backwards until we find the provided number + * of records or hit the provided tail block. The return value is the number of + * records encountered or a negative error code. The log block and buffer + * pointer of the last record seen are returned in rblk and rhead respectively. + */ +STATIC int +xlog_rseek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk backwards from the head block until we hit the tail or the first + * block in the log. + */ + end_blk = head_blk > tail_blk ? tail_blk : 0; + for (i = (int) head_blk - 1; i >= end_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the tail block or the log record header count, + * start looking again from the end of the physical log. Note that + * callers can pass head == tail if the tail is not yet known. + */ + if (tail_blk >= head_blk && found != count) { + for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Seek forward in the log for log record headers. + * + * Given head and tail blocks, walk forward from the tail block until we find + * the provided number of records or hit the head block. The return value is the + * number of records encountered or a negative error code. The log block and + * buffer pointer of the last record seen are returned in rblk and rhead + * respectively. + */ +STATIC int +xlog_seek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk forward from the tail block until we hit the head or the last + * block in the log. + */ + end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; + for (i = (int) tail_blk; i <= end_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + + /* + * If we haven't hit the head block or the log record header count, + * start looking again from the start of the physical log. + */ + if (tail_blk > head_blk && found != count) { + for (i = 0; i < (int) head_blk; i++) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Check the log tail for torn writes. This is required when torn writes are + * detected at the head and the head had to be walked back to a previous record. + * The tail of the previous record must now be verified to ensure the torn + * writes didn't corrupt the previous tail. + * + * Return an error if CRC verification fails as recovery cannot proceed. + */ +STATIC int +xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; + int count; + int error = 0; + bool wrapped; + xfs_daddr_t tmp_head; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* + * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get + * a temporary head block that points after the last possible + * concurrently written record of the tail. + */ + count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, + XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, + &wrapped); + if (count < 0) { + error = count; + goto out; + } + + /* + * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran + * into the actual log head. tmp_head points to the start of the record + * so update it to the actual head block. + */ + if (count < XLOG_MAX_ICLOGS + 1) + tmp_head = head_blk; + + /* + * We now have a tail and temporary head block that covers at least + * XLOG_MAX_ICLOGS records from the tail. We need to verify that these + * records were completely written. Run a CRC verification pass from + * tail to head and return the result. + */ + error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Detect and trim torn writes from the head of the log. + * + * Storage without sector atomicity guarantees can result in torn writes in the + * log in the event of a crash. Our only means to detect this scenario is via + * CRC verification. While we can't always be certain that CRC verification + * failure is due to a torn write vs. an unrelated corruption, we do know that + * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at + * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of + * the log and treat failures in this range as torn writes as a matter of + * policy. In the event of CRC failure, the head is walked back to the last good + * record in the log and the tail is updated from that record and verified. + */ +STATIC int +xlog_verify_head( + struct xlog *log, + xfs_daddr_t *head_blk, /* in/out: unverified head */ + xfs_daddr_t *tail_blk, /* out: tail block */ + struct xfs_buf *bp, + xfs_daddr_t *rhead_blk, /* start blk of last record */ + struct xlog_rec_header **rhead, /* ptr to last record */ + bool *wrapped) /* last rec. wraps phys. log */ +{ + struct xlog_rec_header *tmp_rhead; + struct xfs_buf *tmp_bp; + xfs_daddr_t first_bad; + xfs_daddr_t tmp_rhead_blk; + int found; + int error; + bool tmp_wrapped; + + /* + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. + */ + found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, + rhead, wrapped); + if (found < 0) + return found; + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + + /* + * Now that we have a tail block, check the head of the log for torn + * writes. Search again until we hit the tail or the maximum number of + * log record I/Os that could have been in flight at one time. Use a + * temporary buffer so we don't trash the rhead/bp pointer from the + * call above. + */ + tmp_bp = xlog_get_bp(log, 1); + if (!tmp_bp) + return -ENOMEM; + error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, + XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, + &tmp_rhead, &tmp_wrapped); + xlog_put_bp(tmp_bp); + if (error < 0) + return error; + + /* + * Now run a CRC verification pass over the records starting at the + * block found above to the current head. If a CRC failure occurs, the + * log block of the first bad record is saved in first_bad. + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + if (error == -EFSBADCRC) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. + */ + error = 0; + xfs_warn(log->l_mp, +"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.", + first_bad, *head_blk); + + /* + * Get the header block and buffer pointer for the last good + * record before the bad record. + * + * Note that xlog_find_tail() clears the blocks at the new head + * (i.e., the records with invalid CRC) if the cycle number + * matches the the current cycle. + */ + found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, + rhead_blk, rhead, wrapped); + if (found < 0) + return found; + if (found == 0) /* XXX: right thing to do here? */ + return -EIO; + + /* + * Reset the head block to the starting block of the first bad + * log record and set the tail block based on the last good + * record. + * + * Bail out if the updated head/tail match as this indicates + * possible corruption outside of the acceptable + * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... + */ + *head_blk = first_bad; + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + if (*head_blk == *tail_blk) { + ASSERT(0); + return 0; + } + + /* + * Now verify the tail based on the updated head. This is + * required because the torn writes trimmed from the head could + * have been written over the tail of a previous record. Return + * any errors since recovery cannot proceed if the tail is + * corrupt. + * + * XXX: This leaves a gap in truly robust protection from torn + * writes in the log. If the head is behind the tail, the tail + * pushes forward to create some space and then a crash occurs + * causing the writes into the previous record's tail region to + * tear, log recovery isn't able to recover. + * + * How likely is this to occur? If possible, can we do something + * more intelligent here? Is it safe to push the tail forward if + * we can determine that the tail is within the range of the + * torn write (e.g., the kernel can only overwrite the tail if + * it has actually been pushed forward)? Alternatively, could we + * somehow prevent this condition at runtime? + */ + error = xlog_verify_tail(log, *head_blk, *tail_blk); + } + + return error; +} + +/* * Find the sync block number or the tail of the log. * * This will be the block number of the last record to have its @@ -893,13 +1241,13 @@ xlog_find_tail( xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; - int error, i, found; + int error; xfs_daddr_t umount_data_blk; xfs_daddr_t after_umount_blk; + xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; int hblks; - - found = 0; + bool wrapped = false; /* * Find previous log record @@ -923,48 +1271,16 @@ xlog_find_tail( } /* - * Search backwards looking for log record header block + * Trim the head block back to skip over torn records. We can have + * multiple log I/Os in flight at any time, so we assume CRC failures + * back through the previous several records are torn writes and skip + * them. */ ASSERT(*head_blk < INT_MAX); - for (i = (int)(*head_blk) - 1; i >= 0; i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 1; - break; - } - } - /* - * If we haven't found the log record header block, start looking - * again from the end of the physical log. XXXmiken: There should be - * a check here to make sure we didn't search more than N blocks in - * the previous code. - */ - if (!found) { - for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - error = xlog_bread(log, i, 1, bp, &offset); - if (error) - goto done; - - if (*(__be32 *)offset == - cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 2; - break; - } - } - } - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - xlog_put_bp(bp); - ASSERT(0); - return -EIO; - } - - /* find blk_no of tail of log */ - rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, + &rhead, &wrapped); + if (error) + goto done; /* * Reset log values according to the state of the log when we @@ -976,10 +1292,10 @@ xlog_find_tail( * written was complete and ended exactly on the end boundary * of the physical log. */ - log->l_prev_block = i; + log->l_prev_block = rhead_blk; log->l_curr_block = (int)*head_blk; log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (found == 2) + if (wrapped) log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); @@ -1014,12 +1330,13 @@ xlog_find_tail( } else { hblks = 1; } - after_umount_blk = (i + hblks + (int) - BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = (i + hblks) % log->l_logBBsize; + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) goto done; @@ -3204,6 +3521,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; + int len; if (mp->m_qflags == 0) @@ -3224,8 +3542,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, + &xfs_dquot_buf_ra_ops); } STATIC void @@ -4118,26 +4440,69 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } +STATIC int +xlog_unpack_data( + struct xlog_rec_header *rhead, + char *dp, + struct xlog *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + return 0; +} + /* - * Upack the log buffer data and crc check it. If the check fails, issue a - * warning if and only if the CRC in the header is non-zero. This makes the - * check an advisory warning, and the zero CRC check will prevent failure - * warnings from being emitted when upgrading the kernel from one that does not - * add CRCs by default. - * - * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log - * corruption failure + * CRC check, unpack and process a log record. */ STATIC int -xlog_unpack_data_crc( +xlog_recover_process( + struct xlog *log, + struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - struct xlog *log) + int pass) { + int error; __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); - if (crc != rhead->h_crc) { + + /* + * Nothing else to do if this is a CRC verification pass. Just return + * if this a record with a non-zero crc. Unfortunately, mkfs always + * sets h_crc to 0 so we must consider this valid even on v5 supers. + * Otherwise, return EFSBADCRC on failure so the callers up the stack + * know precisely what failed. + */ + if (pass == XLOG_RECOVER_CRCPASS) { + if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc)) + return -EFSBADCRC; + return 0; + } + + /* + * We're in the normal recovery path. Issue a warning if and only if the + * CRC in the header is non-zero. This is an advisory warning and the + * zero CRC check prevents warnings from being emitted when upgrading + * the kernel from one that does not add CRCs by default. + */ + if (crc != le32_to_cpu(rhead->h_crc)) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", @@ -4147,47 +4512,18 @@ xlog_unpack_data_crc( } /* - * If we've detected a log record corruption, then we can't - * recover past this point. Abort recovery if we are enforcing - * CRC protection by punting an error back up the stack. + * If the filesystem is CRC enabled, this mismatch becomes a + * fatal log corruption failure. */ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) return -EFSCORRUPTED; } - return 0; -} - -STATIC int -xlog_unpack_data( - struct xlog_rec_header *rhead, - char *dp, - struct xlog *log) -{ - int i, j, k; - int error; - - error = xlog_unpack_data_crc(rhead, dp, log); + error = xlog_unpack_data(rhead, dp, log); if (error) return error; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } - - return 0; + return xlog_recover_process_data(log, rhash, rhead, dp, pass); } STATIC int @@ -4239,18 +4575,21 @@ xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, - int pass) + int pass, + xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; + xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; - int error = 0, h_size; + int error = 0, h_size, h_len; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); + rhead_blk = 0; /* * Read the header of the tail block and get the iclog buffer size from @@ -4274,7 +4613,31 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; + + /* + * xfsprogs has a bug where record length is based on lsunit but + * h_size (iclog size) is hardcoded to 32k. Now that we + * unconditionally CRC verify the unmount record, this means the + * log buffer can be too small for the record and cause an + * overrun. + * + * Detect this condition here. Use lsunit for the buffer size as + * long as this looks like the mkfs case. Otherwise, return an + * error to avoid a buffer overrun. + */ h_size = be32_to_cpu(rhead->h_size); + h_len = be32_to_cpu(rhead->h_len); + if (h_len > h_size) { + if (h_len <= log->l_mp->m_logbsize && + be32_to_cpu(rhead->h_num_logops) == 1) { + xfs_warn(log->l_mp, + "invalid iclog size (%d bytes), using lsunit (%d bytes)", + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; + } else + return -EFSCORRUPTED; + } + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; @@ -4301,7 +4664,7 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = tail_blk; + blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -4408,19 +4771,18 @@ xlog_do_recovery_pass( goto bread_err2; } - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, + pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks; + rhead_blk = blk_no; } ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + rhead_blk = blk_no; } /* read first part of physical log */ @@ -4441,21 +4803,22 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks + hblks; + rhead_blk = blk_no; } bread_err2: xlog_put_bp(dbp); bread_err1: xlog_put_bp(hbp); + + if (error && first_bad) + *first_bad = rhead_blk; + return error; } @@ -4493,7 +4856,7 @@ xlog_do_log_recovery( INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS1); + XLOG_RECOVER_PASS1, NULL); if (error != 0) { kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; @@ -4504,7 +4867,7 @@ xlog_do_log_recovery( * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS2); + XLOG_RECOVER_PASS2, NULL); #ifdef DEBUG if (!error) { int i; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ab1bac6a3a1c..be02a68b2fe2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -766,7 +766,6 @@ xfs_growfs_rt_alloc( { xfs_fileoff_t bno; /* block number in file */ struct xfs_buf *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ @@ -811,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 36bd8825bfb0..59c9b7bd958d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -137,7 +137,7 @@ static const match_table_t tokens = { }; -STATIC unsigned long +STATIC int suffix_kstrtoint(char *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; @@ -1714,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 996481eeb491..b44284c1adda 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; int nmaps; @@ -387,7 +386,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt( struct xfs_inode *ip) { xfs_buf_t *bp; - int committed; int done; int error; xfs_fsblock_t first_block; @@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto error_bmap_cancel; /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* * The first xact was committed, so add the inode to the new one. * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ee70f5dec9dc..641d625eb334 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -255,11 +255,47 @@ write_grant_head_show( } XFS_SYSFS_ATTR_RO(write_grant_head); +#ifdef DEBUG +STATIC ssize_t +log_badcrc_factor_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + int ret; + uint32_t val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + log->l_badcrc_factor = val; + + return count; +} + +STATIC ssize_t +log_badcrc_factor_show( + struct kobject *kobject, + char *buf) +{ + struct xlog *log = to_xlog(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); +} + +XFS_SYSFS_ATTR_RW(log_badcrc_factor); +#endif /* DEBUG */ + static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), +#ifdef DEBUG + ATTR_LIST(log_badcrc_factor), +#endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 877079eb0f8f..391d797cb53f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1222,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); +DECLARE_EVENT_CLASS(xfs_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(xfs_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(xfs_vm_readpage); +DEFINE_READPAGE_EVENT(xfs_vm_readpages); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int type, struct xfs_bmbt_irec *irec), diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ce78534a047e..995170194df0 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -572,12 +572,16 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - /* no warnings for project quotas - we just return ENOSPC later */ + enum quota_type qtype; + if (dqp->dq_flags & XFS_DQ_PROJ) - return; - quota_send_warning(make_kqid(&init_user_ns, - (dqp->dq_flags & XFS_DQ_USER) ? - USRQUOTA : GRPQUOTA, + qtype = PRJQUOTA; + else if (dqp->dq_flags & XFS_DQ_USER) + qtype = USRQUOTA; + else + qtype = GRPQUOTA; + + quota_send_warning(make_kqid(&init_user_ns, qtype, be32_to_cpu(dqp->q_core.d_id)), mp->m_super->s_dev, type); } |