diff options
Diffstat (limited to 'fs')
66 files changed, 820 insertions, 460 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c547cca26a26..51d2e4de34eb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -696,6 +696,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +#ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, struct page *newpage, struct page *page) { @@ -712,12 +713,9 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; -#ifdef CONFIG_MIGRATION return migrate_page(mapping, newpage, page); -#else - return -ENOSYS; -#endif } +#endif static int btree_writepage(struct page *page, struct writeback_control *wbc) { @@ -1009,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - BUG_ON(!root->node); + if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + free_extent_buffer(root->node); + return -EIO; + } root->commit_root = btrfs_root_node(root); return 0; } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 6f0444473594..659f532d26a0 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - static struct dentry *dentry; + struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bcd59c7dfb57..227e5815d838 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -429,6 +429,7 @@ err: static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_trans_handle *trans, + struct btrfs_root *root, int load_cache_only) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, /* * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. + * to have the normal tree locking. Also if we are currently trying to + * allocate blocks for the tree root we can't do the fast caching since + * we likely hold important locks. */ - if (!trans->transaction->in_commit) { + if (!trans->transaction->in_commit && + (root && root != root->fs_info->tree_root)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_root *root = block_group->fs_info->tree_root; struct inode *inode = NULL; u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; int num_pages = 0; int retries = 0; int ret = 0; @@ -2795,6 +2800,8 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED) { + /* We're not cached, don't bother trying to write stuff out */ + dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } @@ -2821,6 +2828,8 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2828,10 +2837,7 @@ out_free: btrfs_release_path(root, path); out: spin_lock(&block_group->lock); - if (ret) - block_group->disk_cache_state = BTRFS_DC_ERROR; - else - block_group->disk_cache_state = BTRFS_DC_SETUP; + block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); return ret; @@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); @@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, 1); + cache_block_group(cache, trans, NULL, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4930,11 +4942,31 @@ search: btrfs_get_block_group(block_group); search_start = block_group->key.objectid; + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, data)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((data & extra) && !(block_group->flags & extra)) + goto loop; + } + have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { u64 free_percent; - ret = cache_block_group(block_group, trans, 1); + ret = cache_block_group(block_group, trans, + orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) goto have_block_group; @@ -4958,7 +4990,8 @@ have_block_group: if (loop > LOOP_CACHING_NOWAIT || (loop > LOOP_FIND_IDEAL && atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group, trans, 0); + ret = cache_block_group(block_group, trans, + orig_root, 0); BUG_ON(ret); } found_uncached_bg = true; @@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, 0); + cache_block_group(block_group, trans, NULL, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, NULL, NULL); BUG_ON(ret < 0); if (ret > 0) { - ret = btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); - BUG_ON(ret); + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); } } @@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; - leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); cache = kzalloc(sizeof(*cache), GFP_NOFS); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c1faded5fca0..66836d85763b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page **prepared_pages, struct iov_iter *i) { - size_t copied; + size_t copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); + int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; -again: - if (unlikely(iov_iter_fault_in_readable(i, count))) - return -EFAULT; - - /* Copy data from userspace to the current page */ - copied = iov_iter_copy_from_user(page, i, offset, count); + /* + * Copy data from userspace to the current page + * + * Disable pagefault to avoid recursive lock since + * the pages are already locked + */ + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); iov_iter_advance(i, copied); write_bytes -= copied; + total_copied += copied; + /* Return to btrfs_file_aio_write to fault page */ if (unlikely(copied == 0)) { - count = min_t(size_t, PAGE_CACHE_SIZE - offset, - iov_iter_single_seg_count(i)); - goto again; + break; } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { @@ -81,7 +85,7 @@ again: offset = 0; } } - return 0; + return total_copied; } /* @@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, unsigned long last_index; int will_write; int buffered = 0; + int copied = 0; + int dirty_pages = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); @@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_delalloc_reserve_space(inode, write_bytes); + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { + ret = -EFAULT; + goto out; + } + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); if (ret) goto out; @@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); + btrfs_delalloc_release_space(inode, + num_pages << PAGE_CACHE_SHIFT); goto out; } - ret = btrfs_copy_from_user(pos, num_pages, + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - if (ret == 0) { + dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (num_pages > dirty_pages) { + if (copied > 0) + atomic_inc( + &BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT); + } + + if (copied > 0) { dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); + dirty_pages, pos, copied); } btrfs_drop_pages(pages, num_pages); - if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); - goto out; - } - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1); - } else { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - num_pages); - if (num_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); + if (copied > 0) { + if (will_write) { + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + copied - 1); + } else { + balance_dirty_pages_ratelimited_nr( + inode->i_mapping, + dirty_pages); + if (dirty_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } } - pos += write_bytes; - num_written += write_bytes; + pos += copied; + num_written += copied; cond_resched(); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 22ee0dc2e6b8..60d684266959 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation, (unsigned long long)block_group->key.objectid); - goto out; + goto free_cache; } if (!num_entries) @@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, return 0; } + node = rb_first(&block_group->free_space_offset); + if (!node) { + iput(inode); + return 0; + } + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & @@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, */ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - node = rb_first(&block_group->free_space_offset); - if (!node) - goto out_free; - /* * Lock all pages first so we can lock the extent safely. * diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8039390bd6a6..72f31ecb5c90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -495,7 +495,7 @@ again: add_async_extent(async_cow, start, num_bytes, total_compressed, pages, nr_pages_ret); - if (start + num_bytes < end && start + num_bytes < actual_end) { + if (start + num_bytes < end) { start += num_bytes; pages = NULL; cond_resched(); @@ -5712,9 +5712,9 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (err) { printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " - "disk_bytenr %lu len %u err no %d\n", - dip->inode->i_ino, bio->bi_rw, bio->bi_sector, - bio->bi_size, err); + "sector %#Lx len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, + (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; /* @@ -5934,8 +5934,7 @@ free_ordered: */ if (write) { struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, - dip->logical_offset); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f1c9bb4079ed..f87552a1d7ea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -947,23 +947,42 @@ out: static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol, - int async) + int v2) { struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; char *name; u64 fd; - u64 transid = 0; int ret; - if (async) { - async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); - if (IS_ERR(async_vol_args)) - return PTR_ERR(async_vol_args); + if (v2) { + u64 transid = 0; + u64 *ptr = NULL; - name = async_vol_args->name; - fd = async_vol_args->fd; - async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; + vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); + if (IS_ERR(vol_args_v2)) + return PTR_ERR(vol_args_v2); + + if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out; + } + + name = vol_args_v2->name; + fd = vol_args_v2->fd; + vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, ptr); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; } else { vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) @@ -971,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, name = vol_args->name; fd = vol_args->fd; vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - } - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, &transid); - if (!ret && async) { - if (copy_to_user(arg + - offsetof(struct btrfs_ioctl_async_vol_args, - transid), &transid, sizeof(transid))) - return -EFAULT; + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, NULL); } - +out: kfree(vol_args); - kfree(async_vol_args); + kfree(vol_args_v2); return ret; } @@ -2246,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0, 0); - case BTRFS_IOC_SNAP_CREATE_ASYNC: + case BTRFS_IOC_SNAP_CREATE_V2: return btrfs_ioctl_snap_create(file, argp, 0, 1); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1, 0); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 17c99ebdf960..c344d12c646b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; -#define BTRFS_SNAPSHOT_NAME_MAX 4079 -struct btrfs_ioctl_async_vol_args { +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; - char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; }; #define BTRFS_INO_LOOKUP_PATH_MAX 4080 @@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_args) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_async_vol_args) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) #endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 79cba5fbc28e..f8be250963a0 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret < 0) goto out; + if (ret) { + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dbb51ea7a13c..883c6fa1367e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -685,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, mutex_unlock(&root->d_inode->i_mutex); if (IS_ERR(new_root)) { + dput(root); deactivate_locked_super(s); error = PTR_ERR(new_root); - dput(root); goto error_free_subvol_name; } if (!new_root->d_inode) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc04dc1445d6..6b9884507837 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (strcmp(device->name, path)) { + } else if (!device->name || strcmp(device->name, path)) { name = kstrdup(path, GFP_NOFS); if (!name) return -ENOMEM; kfree(device->name); device->name = name; + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } } if (found_transid > fs_devices->latest_trans) { @@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->num_devices--; + if (device->missing) + root->fs_info->fs_devices->missing_devices--; + next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); if (device->bdev == root->fs_info->sb->s_bdev) @@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, device->devid = devid; device->work.func = pending_bios_fn; device->fs_devices = fs_devices; + device->missing = 1; fs_devices->num_devices++; + fs_devices->missing_devices++; spin_lock_init(&device->io_lock); INIT_LIST_HEAD(&device->dev_alloc_list); memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); @@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root, device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. + * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; } } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2b638b6e4eea..2740db49eb04 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -44,6 +44,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; + int missing; spinlock_t io_lock; @@ -93,6 +94,7 @@ struct btrfs_fs_devices { u64 num_devices; u64 open_devices; u64 rw_devices; + u64 missing_devices; u64 total_rw_bytes; struct block_device *latest_bdev; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 7d447af84ec4..d902948a90d8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) dentry->d_op = &ceph_dentry_ops; else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) dentry->d_op = &ceph_snapdir_dentry_ops; @@ -114,8 +115,8 @@ static int __dcache_readdir(struct file *filp, spin_lock(&dcache_lock); /* start at beginning? */ - if (filp->f_pos == 2 || (last && - filp->f_pos < ceph_dentry(last)->offset)) { + if (filp->f_pos == 2 || last == NULL || + filp->f_pos < ceph_dentry(last)->offset) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8d79b8912e31..7d0e4a82d898 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -282,7 +282,8 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages) + int *checkeof, bool align_to_pages, + unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); @@ -307,7 +308,7 @@ static int striped_read(struct inode *inode, more: if (align_to_pages) - page_align = (pos - io_align) & ~PAGE_MASK; + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; this_len = left; @@ -376,16 +377,18 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, struct inode *inode = file->f_dentry->d_inode; struct page **pages; u64 off = *poff; - int num_pages = calc_pages_for(off, len); - int ret; + int num_pages, ret; dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) - pages = ceph_get_direct_page_vector(data, num_pages); - else + if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, num_pages, true); + } else { + num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -400,7 +403,8 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, goto done; ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT); + file->f_flags & O_DIRECT, + (unsigned long)data & ~PAGE_MASK); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -409,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, done: if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, true); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -456,6 +460,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int do_sync = 0; int check_caps = 0; int page_align, io_align; + unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -471,6 +476,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, pos = *offset; io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) @@ -496,12 +502,15 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; - if (file->f_flags & O_DIRECT) + if (file->f_flags & O_DIRECT) { /* write from beginning of first page, regardless of io alignment */ - page_align = (pos - io_align) & ~PAGE_MASK; - else + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + num_pages = calc_pages_for((unsigned long)data, len); + } else { page_align = pos & ~PAGE_MASK; + num_pages = calc_pages_for(pos, len); + } req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, @@ -512,10 +521,8 @@ more: if (!req) return -ENOMEM; - num_pages = calc_pages_for(pos, len); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages); + pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -565,7 +572,7 @@ more: } if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, false); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index a6ce54e94eb5..52e8fd74d450 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include <linux/ioctl.h> #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x98 +#define CEPH_IOCTL_MAGIC 0x97 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 40abde93c345..476b329867d4 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -11,40 +11,68 @@ * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, - u64 pid, u64 pid_ns, - int cmd, u64 start, u64 length, u8 wait) + int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; + u64 length = 0; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = igrab(inode); + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " "length: %llu, wait: %d, type`: %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd); + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type); + req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; - req->r_args.filelock_change.pid = cpu_to_le64(pid); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); /* This should be adjusted, but I'm not sure if namespaces actually get id numbers*/ req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)pid_ns); - req->r_args.filelock_change.start = cpu_to_le64(start); + cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); + + if ( operation == CEPH_MDS_OP_GETFILELOCK){ + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd, err); + "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); return err; } @@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 0; @@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; - - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - dout("mds locked, locking locally"); - err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); - dout("got %d on posix_lock_file, undid lock", err); + if ( op != CEPH_MDS_OP_GETFILELOCK ){ + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if the kernel detects + * local deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", err); + } } + } else { dout("mds returned error code %d", err); } @@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 1; @@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - /* mds requires start and length rather than start and end */ - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + file, lock_cmd, wait, fl); if (!err) { err = flock_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); + file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } } else { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 098b18508479..38800eaa81d0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -202,6 +202,38 @@ out_bad: } /* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info); + else + return parse_reply_info_dir(p, end, info); +} + +/* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, @@ -223,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg, goto out_bad; } - /* dir content */ + /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_dir(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info); if (err < 0) goto out_bad; } @@ -2074,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } @@ -2094,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && rinfo->dir_nr) + if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 9341fd4f1432..aabe563b54db 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in { }; /* - * parsed info about an mds reply, including information about the - * target inode and/or its parent directory and dentry, and directory - * contents (for readdir results). + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; + /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; - struct ceph_mds_reply_dirfrag *dir_dir; - int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index adefa60a9bdc..43b19dd39191 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o cifsacl.o + readdir.o ioctl.o sess.o export.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/README b/fs/cifs/README index ee68d1036544..46af99ab3614 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -337,6 +337,15 @@ A partial list of the supported mount options follows: wsize default write size (default 57344) maximum wsize currently allowed by CIFS is 57344 (fourteen 4096 byte pages) + actimeo=n attribute cache timeout in seconds (default 1 second). + After this timeout, the cifs client requests fresh attribute + information from the server. This option allows to tune the + attribute cache timeout to suit the workload needs. Shorter + timeouts mean better the cache coherency, but increased number + of calls to the server. Longer timeouts mean reduced number + of calls to the server at the expense of less stricter cache + coherency checks (i.e. incorrect attribute cache for a short + period of time). rw mount the network share read-write (note that the server may still consider the share read-only) ro mount network share read-only diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index e9a393c9c2ca..7852cd677051 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -48,6 +48,7 @@ struct cifs_sb_info { struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ atomic_t active; uid_t mnt_uid; gid_t mnt_gid; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index c6ebea088ac7..a437ec391a01 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -30,8 +30,6 @@ #include "cifs_debug.h" -#ifdef CONFIG_CIFS_EXPERIMENTAL - static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, @@ -774,4 +772,3 @@ int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) return rc; } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 6c8096cf5155..c4ae7d036563 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -74,11 +74,7 @@ struct cifs_wksid { char sidname[SIDNAMELENGTH]; } __attribute__((packed)); -#ifdef CONFIG_CIFS_EXPERIMENTAL - extern int match_sid(struct cifs_sid *); extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); -#endif /* CONFIG_CIFS_EXPERIMENTAL */ - #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 76c8a906a63e..3936aa7f2c22 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -463,6 +463,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); return 0; } @@ -935,7 +937,6 @@ init_cifs(void) GlobalCurrentXid = 0; GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; - memset(Local_System_Name, 0, 15); spin_lock_init(&cifs_tcp_ses_lock); spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b577bf0a1bb3..7136c0c3e2f9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -45,6 +45,16 @@ #define CIFS_MIN_RCV_POOL 4 /* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. It also matches the most common * value of max multiplex returned by servers. We may @@ -746,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ /* on midQ entries */ -GLOBAL_EXTERN char Local_System_Name[15]; - /* * Global counters, updated atomically */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index db961dc4fd3d..e6d1481b16c1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -54,7 +54,8 @@ do { \ __func__, curr_xid, (int)rc); \ } while (0) extern char *build_path_from_dentry(struct dentry *); -extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); -#ifdef CONFIG_CIFS_EXPERIMENTAL extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -#endif extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 2f2632b6df5a..67acfb3acad2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2478,95 +2478,6 @@ querySymLinkRetry: } #ifdef CONFIG_CIFS_EXPERIMENTAL -/* Initialize NT TRANSACT SMB into small smb request buffer. - This assumes that all NT TRANSACTS that we init here have - total parm and data under about 400 bytes (to fit in small cifs - buffer size), which is the case so far, it easily fits. NB: - Setup words themselves and ByteCount - MaxSetupCount (size of returned setup area) and - MaxParameterCount (returned parms size) must be set by caller */ -static int -smb_init_nttransact(const __u16 sub_command, const int setup_count, - const int parm_len, struct cifsTconInfo *tcon, - void **ret_buf) -{ - int rc; - __u32 temp_offset; - struct smb_com_ntransact_req *pSMB; - - rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, - (void **)&pSMB); - if (rc) - return rc; - *ret_buf = (void *)pSMB; - pSMB->Reserved = 0; - pSMB->TotalParameterCount = cpu_to_le32(parm_len); - pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->DataCount = pSMB->TotalDataCount; - temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + - (setup_count * 2) - 4 /* for rfc1001 length itself */; - pSMB->ParameterOffset = cpu_to_le32(temp_offset); - pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); - pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ - pSMB->SubCommand = cpu_to_le16(sub_command); - return 0; -} - -static int -validate_ntransact(char *buf, char **ppparm, char **ppdata, - __u32 *pparmlen, __u32 *pdatalen) -{ - char *end_of_smb; - __u32 data_count, data_offset, parm_count, parm_offset; - struct smb_com_ntransact_rsp *pSMBr; - - *pdatalen = 0; - *pparmlen = 0; - - if (buf == NULL) - return -EINVAL; - - pSMBr = (struct smb_com_ntransact_rsp *)buf; - - /* ByteCount was converted from little endian in SendReceive */ - end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + - (char *)&pSMBr->ByteCount; - - data_offset = le32_to_cpu(pSMBr->DataOffset); - data_count = le32_to_cpu(pSMBr->DataCount); - parm_offset = le32_to_cpu(pSMBr->ParameterOffset); - parm_count = le32_to_cpu(pSMBr->ParameterCount); - - *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; - *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; - - /* should we also check that parm and data areas do not overlap? */ - if (*ppparm > end_of_smb) { - cFYI(1, "parms start after end of smb"); - return -EINVAL; - } else if (parm_count + *ppparm > end_of_smb) { - cFYI(1, "parm end after end of smb"); - return -EINVAL; - } else if (*ppdata > end_of_smb) { - cFYI(1, "data starts after end of smb"); - return -EINVAL; - } else if (data_count + *ppdata > end_of_smb) { - cFYI(1, "data %p + count %d (%p) past smb end %p start %p", - *ppdata, data_count, (data_count + *ppdata), - end_of_smb, pSMBr); - return -EINVAL; - } else if (parm_count + data_count > pSMBr->ByteCount) { - cFYI(1, "parm count and data count larger than SMB"); - return -EINVAL; - } - *pdatalen = data_count; - *pparmlen = parm_count; - return 0; -} - int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, @@ -3056,7 +2967,97 @@ GetExtAttrOut: #endif /* CONFIG_POSIX */ -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifsTconInfo *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + /* ByteCount was converted from little endian in SendReceive */ + end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cFYI(1, "parms start after end of smb"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cFYI(1, "parm end after end of smb"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cFYI(1, "data starts after end of smb"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cFYI(1, "data %p + count %d (%p) past smb end %p start %p", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > pSMBr->ByteCount) { + cFYI(1, "parm count and data count larger than SMB"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, @@ -3214,7 +3215,7 @@ setCifsAclRetry: return (rc); } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ +#endif /* CONFIG_CIFS_ACL */ /* Legacy Query Path Information call for lookup to old servers such as Win9x/WinME */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 32fa4d9b5dbc..cc1a8604a790 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -105,6 +105,7 @@ struct smb_vol { unsigned int wsize; bool sockopt_tcp_nodelay:1; unsigned short int port; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ char *prepath; struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; @@ -806,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname, short int override_gid = -1; bool uid_specified = false; bool gid_specified = false; + char *nodename = utsname()->nodename; separator[0] = ','; separator[1] = 0; - if (Local_System_Name[0] != 0) - memcpy(vol->source_rfc1001_name, Local_System_Name, 15); - else { - char *nodename = utsname()->nodename; - int n = strnlen(nodename, 15); - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < n; i++) { - /* does not have to be perfect mapping since field is - informational, only used for servers that do not support - port 445 and it can be overridden at mount time */ - vol->source_rfc1001_name[i] = toupper(nodename[i]); - } - } + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, 15); + for (i = 0; i < strnlen(nodename, 15); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ @@ -840,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* default to using server inode numbers where available */ vol->server_ino = 1; + vol->actimeo = CIFS_DEF_ACTIMEO; + if (!options) return 1; @@ -1214,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: server net" "biosname longer than 15 truncated.\n"); } + } else if (strnicmp(data, "actimeo", 7) == 0) { + if (value && *value) { + vol->actimeo = HZ * simple_strtoul(value, + &value, 0); + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + return 1; + } + } } else if (strnicmp(data, "credentials", 4) == 0) { /* ignore */ } else if (strnicmp(data, "version", 3) == 0) { @@ -2571,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cFYI(1, "file mode: 0x%x dir mode: 0x%x", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + cifs_sb->actimeo = pvolume_info->actimeo; + if (pvolume_info->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) @@ -2821,13 +2833,13 @@ remote_path_check: /* check if a whole path (including prepath) is not remote */ if (!rc && cifs_sb->prepathlen && tcon) { /* build_path_to_root works only when we have a valid tcon */ - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; } rc = is_path_accessible(xid, tcon, cifs_sb, full_path); - if (rc != -EREMOTE) { + if (rc != 0 && rc != -EREMOTE) { kfree(full_path); goto mount_fail_check; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b857ce5db775..5a28660ca2b5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1108,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, return total_written; } -#ifdef CONFIG_CIFS_EXPERIMENTAL struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { @@ -1142,7 +1141,6 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, spin_unlock(&cifs_file_list_lock); return NULL; } -#endif struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 28cb6e735943..589f3e3f6e00 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -686,7 +686,7 @@ int cifs_get_inode_info(struct inode **pinode, cFYI(1, "cifs_sfu_type failed: %d", tmprc); } -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, @@ -697,7 +697,7 @@ int cifs_get_inode_info(struct inode **pinode, goto cgii_exit; } } -#endif +#endif /* CONFIG_CIFS_ACL */ /* fill in remaining high mode bits e.g. SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) @@ -728,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon) { int pplen = cifs_sb->prepathlen; int dfsplen; char *full_path = NULL; - struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); /* if no prefix path, simply set path to the root of share to "" */ if (pplen == 0) { @@ -875,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) char *full_path; struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -1653,6 +1653,7 @@ static bool cifs_inode_needs_reval(struct inode *inode) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); if (cifs_i->clientCanCacheRead) return false; @@ -1663,12 +1664,12 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; - /* FIXME: the actimeo should be tunable */ - if (time_after_eq(jiffies, cifs_i->time + HZ)) + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) return true; /* hardlinked files w/ noserverino get "special" treatment */ - if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && S_ISREG(inode->i_mode) && inode->i_nlink != 1) return true; @@ -2121,7 +2122,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { rc = 0; -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { rc = mode_to_cifs_acl(inode, full_path, mode); if (rc) { @@ -2130,7 +2131,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; } } else -#endif +#endif /* CONFIG_CIFS_ACL */ if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 32d300e8f20e..a73eb9f4bdaf 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -759,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, ino, fattr.cf_dtype); - /* - * we can not return filldir errors to the caller since they are - * "normal" when the stat blocksize is too small - we return remapped - * error instead - * - * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above - * case already. Why should we be clobbering other errors from it? - */ - if (rc) { - cFYI(1, "filldir rc = %d", rc); - rc = -EOVERFLOW; - } dput(tmp_dentry); return rc; } diff --git a/fs/exec.c b/fs/exec.c index d68c378a3137..c62efcb959c7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -275,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + err = insert_vm_struct(mm, vma); if (err) goto err; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6a5edea2d70b..94ce3d7a1c4b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -910,6 +910,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bdbe69902207..e659597b690b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, */ if (unlikely(journal_data && PageChecked(page))) err = __ext4_journalled_writepage(page, len); - else + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); + else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); if (!err) mpd->pages_written++; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 92203b8a099f..dc40e75cba88 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '0')) { + (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dc963929de65..981c8477adab 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) + ext4_set_bit(bit, bh->b_data); ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) + ext4_set_bit(bit, bh->b_data); if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e32195d6aac3..fb15c9c0be74 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); + if (test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",mblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1239,8 +1241,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, @@ -1304,6 +1306,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1725,6 +1729,12 @@ set_qf_format: case Opt_nodelalloc: clear_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_mblk_io_submit: + set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; case Opt_stripe: if (match_int(&args[0], &option)) return 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9242d294fe90..8b984a2cebbd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/compat.h> static const struct file_operations fuse_direct_io_file_operations; @@ -1628,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, } /* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +/* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed * to dereference the passed pointer, so the parameter requires deep @@ -1808,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - err = -EIO; - if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) - goto out; - - /* okay, copy in iovs and retry */ vaddr = kmap_atomic(pages[0], KM_USER0); - memcpy(page_address(iov_page), vaddr, transferred); + err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; in_iov = page_address(iov_page); out_iov = in_iov + in_iovs; + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + goto retry; } diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index f46ee8b0e135..9da29706f91c 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) super->s_journal_seg[i] = segno; super->s_journal_ec[i] = ec; logfs_set_segment_reserved(sb, segno); - err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); BUG_ON(err); /* mempool should prevent this */ err = logfs_erase_segment(sb, segno, 1); BUG_ON(err); /* FIXME: remount-ro would be nicer */ diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 6127baf0e188..ee99a9f5dfd3 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode) /* FIXME: transaction is part of logfs_block now. Is that enough? */ err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + logfs_put_write_page(page); return err; } diff --git a/fs/namei.c b/fs/namei.c index 5362af9b7372..4ff7ca530533 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname, if (!(open_flag & O_CREAT)) mode = 0; + /* Must never be set by userspace */ + open_flag &= ~FMODE_NONOTIFY; + /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f0a384e2ae63..996dd8989a91 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static int nfs_readdir_clear_array(struct page*, gfp_t); +static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -83,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; -const struct address_space_operations nfs_dir_addr_space_ops = { - .releasepage = nfs_readdir_clear_array, +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, }; #ifdef CONFIG_NFS_V3 @@ -178,6 +178,7 @@ typedef struct { struct page *page; unsigned long page_index; u64 *dir_cookie; + u64 last_cookie; loff_t current_index; decode_dirent_t decode; @@ -213,17 +214,15 @@ void nfs_readdir_release_array(struct page *page) * we are freeing strings created by nfs_add_to_readdir_array() */ static -int nfs_readdir_clear_array(struct page *page, gfp_t mask) +void nfs_readdir_clear_array(struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array; int i; - if (IS_ERR(array)) - return PTR_ERR(array); + array = kmap_atomic(page, KM_USER0); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); - nfs_readdir_release_array(page); - return 0; + kunmap_atomic(array, KM_USER0); } /* @@ -272,7 +271,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) goto out; array->last_cookie = entry->cookie; array->size++; - if (entry->eof == 1) + if (entry->eof != 0) array->eof_index = array->size; out: nfs_readdir_release_array(page); @@ -312,15 +311,14 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; - status = 0; - goto out; + return 0; } } - if (i == array->eof_index) { - desc->eof = 1; + if (array->eof_index >= 0) { status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; } -out: return status; } @@ -328,10 +326,7 @@ static int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) { struct nfs_cache_array *array; - int status = -EBADCOOKIE; - - if (desc->dir_cookie == NULL) - goto out; + int status; array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -344,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) else status = nfs_readdir_search_for_cookie(array, desc); + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->page_index++; + } nfs_readdir_release_array(desc->page); out: return status; @@ -490,7 +489,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; - if (desc->plus == 1) + if (desc->plus != 0) nfs_prime_dcache(desc->file->f_path.dentry, entry); status = nfs_readdir_add_to_array(entry, page); @@ -498,7 +497,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); - if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { array->eof_index = array->size; @@ -563,7 +562,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; - entry.cookie = *desc->dir_cookie; + entry.cookie = desc->last_cookie; entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); @@ -636,6 +635,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); page_cache_release(desc->page); desc->page = NULL; } @@ -660,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) return PTR_ERR(desc->page); res = nfs_readdir_search_array(desc); - if (res == 0) - return 0; - cache_page_release(desc); + if (res != 0) + cache_page_release(desc); return res; } @@ -672,22 +672,16 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { int res; - if (desc->page_index == 0) + if (desc->page_index == 0) { desc->current_index = 0; - while (1) { - res = find_cache_page(desc); - if (res != -EAGAIN) - break; - desc->page_index++; + desc->last_cookie = 0; } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); return res; } -static inline unsigned int dt_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -717,13 +711,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, break; } file->f_pos++; - desc->cache_entry_index = i; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; } - if (i == array->eof_index) + if (array->eof_index >= 0) desc->eof = 1; nfs_readdir_release_array(desc->page); @@ -764,6 +757,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, } desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; desc->page = page; status = nfs_readdir_xdr_to_array(desc, page, inode); @@ -791,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - int res = -ENOMEM; + int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -816,7 +810,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res < 0) goto out; - while (desc->eof != 1) { + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { @@ -844,7 +838,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) res = nfs_do_filldir(desc, dirent, filldir); if (res < 0) break; - } + } while (!desc->eof); out: nfs_unblock_sillyrename(dentry); if (res > 0) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 60677f9f1311..7bf029ef4084 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; + unsigned int saved_type = fl->fl_type; /* Try local locking first */ posix_test_lock(filp, fl); @@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) /* found a conflict */ goto out; } + fl->fl_type = saved_type; if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 314f57164602..e67e31c73416 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index eceafe74f473..4f981f1f6689 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = { static struct rpc_version mnt_version1 = { .number = 1, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; static struct rpc_version mnt_version3 = { .number = 3, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6a653ffd8e4e..4435e5e1f904 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3361,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -3389,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 137b549e63db..b68536cc9046 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) { if (!nfs_lock_request_dontget(req)) return 0; - if (req->wb_page != NULL) + if (test_bit(PG_MAPPED, &req->wb_flags)) radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } @@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) */ void nfs_clear_page_tag_locked(struct nfs_page *req) { - if (req->wb_page != NULL) { + if (test_bit(PG_MAPPED, &req->wb_flags)) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e4b62c6f5a6e..aedcaa7f291f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req) (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3c045044fca2..4100630c9a5b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1069,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case Opt_v4: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1286,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case NFS4_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif default: goto out_invalid_value; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4c14c17a5276..10d648ea128b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) if (nfs_have_delegation(inode, FMODE_WRITE)) nfsi->change_attr++; } + set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; @@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&inode->i_lock); set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; if (!nfsi->npages) { @@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) iput(inode); } else spin_unlock(&inode->i_lock); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2a533a0af2a9..7e84a852cdae 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &fhp->fh_post_attr); fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; - if (err) + if (err) { fhp->fh_post_saved = 0; - else + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else fhp->fh_post_saved = 1; } diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4d476ff08ae6..60fce3dc5cb5 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 33ad25ddd5c4..caf9a6a3fb54 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) int nilfs_init_gcinode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); - /* - * Add the inode to GC inode list. Garbage Collection - * is serialized and no two processes manipulate the - * list simultaneously. - */ - igrab(inode); - list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); - return 0; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e00d9457c256..b185e937a335 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; struct inode *inode; struct nilfs_vdesc *vdesc; struct buffer_head *bh, *n; @@ -353,6 +354,17 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ret = PTR_ERR(inode); goto failed; } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + do { ret = nilfs_ioctl_move_inode_block(inode, vdesc, &buffers); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b04f88eed09e..f35794b97e8e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response); + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) /* bypass_perm set */ + return 0; /* userspace responded, convert to something usable */ spin_lock(&event->lock); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 063224812b7e..8b61220cffc5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) return client_fd; } -static ssize_t fill_event_metadata(struct fsnotify_group *group, +static int fill_event_metadata(struct fsnotify_group *group, struct fanotify_event_metadata *metadata, struct fsnotify_event *event) { + int ret = 0; + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, group, metadata, event); metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - metadata->fd = create_fd(group, event); + if (unlikely(event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event); + if (metadata->fd < 0) + ret = metadata->fd; + } - return metadata->fd; + return ret; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS @@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, mutex_lock(&group->fanotify_data.access_mutex); - if (group->fanotify_data.bypass_perm) { + if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); event->response = FAN_ALLOW; @@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - fd = fill_event_metadata(group, &fanotify_event_metadata, event); - if (fd < 0) - return fd; + ret = fill_event_metadata(group, &fanotify_event_metadata, event); + if (ret < 0) + goto out; + fd = fanotify_event_metadata.fd; ret = prepare_for_access_response(group, event, fd); if (ret) goto out_close_fd; ret = -EFAULT; - if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) goto out_kill_access_response; - return FAN_EVENT_METADATA_LEN; + return fanotify_event_metadata.event_len; out_kill_access_response: remove_access_response(group, event, fd); out_close_fd: - sys_close(fd); + if (fd != FAN_NOFD) + sys_close(fd); +out: +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + event->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + } +#endif return ret; } @@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) mutex_lock(&group->fanotify_data.access_mutex); - group->fanotify_data.bypass_perm = true; + atomic_inc(&group->fanotify_data.bypass_perm); list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, @@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~mnt->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); - - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } /* fanotify syscalls */ @@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_group(&fanotify_fsnotify_ops); - if (IS_ERR(group)) + if (IS_ERR(group)) { + free_uid(user); return PTR_ERR(group); + } group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); @@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: @@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, if (flags & ~FAN_ALL_MARK_FLAGS) return -EINVAL; switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: + case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; case FAN_MARK_FLUSH: break; default: diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 444c305a468c..4cd5d5d78f9f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (ret >= 0) return ret; + fsnotify_put_group(group); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f1e962cb3b73..0d7c5540ad66 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + if (ocfs2_iocb_is_sem_locked(iocb)) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); - if (!level) - up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); if (is_async) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 76bfdfda691a..eceb456037c1 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) else clear_bit(1, (unsigned long *)&iocb->private); } + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication bewteen + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_NUM_LOCKS +}; + #define ocfs2_iocb_clear_rw_locked(iocb) \ - clear_bit(0, (unsigned long *)&iocb->private) + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ - test_bit(1, (unsigned long *)&iocb->private) + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index c7fba396392d..6c61771469af 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(QUOTA), define_mask(REFCOUNT), define_mask(BASTS), + define_mask(RESERVATIONS), + define_mask(CLUSTER), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), - define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index ea2ed9f56c94..34d6544357d9 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -81,7 +81,7 @@ #include <linux/sched.h> /* bits that are frequently given and infrequently matched in the low word */ -/* NOTE: If you add a flag, you need to also update mlog.c! */ +/* NOTE: If you add a flag, you need to also update masklog.c! */ #define ML_ENTRY 0x0000000000000001ULL /* func call entry */ #define ML_EXIT 0x0000000000000002ULL /* func call exit */ #define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ @@ -114,13 +114,14 @@ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ #define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ -#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ +#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */ +#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */ +#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */ + /* bits that are infrequently given and frequently matched in the high word */ -#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ -#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ -#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ -#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c49f6de0e7ab..d417b3f9b0c7 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, di->i_dx_root = cpu_to_le64(dr_blkno); + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_journal_dirty(handle, di_bh); @@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, goto out_commit; } + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f564b0e5f80d..59f0f6bdfc62 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - int *numlocks) + int *numlocks, + int *hasrefs) { int ret; int i; @@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); + *numlocks = 0; + *hasrefs = 0; + ret = -EINVAL; if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "cannot migrate lockres with unknown owner!\n"); @@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, } *numlocks = count; - mlog(0, "migrateable lockres having %d locks\n", *numlocks); + + count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (count < O2NM_MAX_NODES) + *hasrefs = 1; + + mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, + res->lockname.len, res->lockname.name, *numlocks, *hasrefs); leave: return ret; @@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; - int numlocks; + int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) @@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "migrating %.*s to %u\n", namelen, name, target); + mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); /* * ensure this lockres is a proper candidate for migration */ spin_lock(&res->spinlock); - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); if (ret < 0) { spin_unlock(&res->spinlock); goto leave; @@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_unlock(&res->spinlock); /* no work to do */ - if (numlocks == 0) { - mlog(0, "no locks were found on this lockres! done!\n"); + if (numlocks == 0 && !hasrefs) goto leave; - } /* * preallocate up front @@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * find a node to migrate the lockres to */ - mlog(0, "picking a migration node\n"); spin_lock(&dlm->spinlock); /* pick a new node */ if (!test_bit(target, dlm->domain_map) || target >= O2NM_MAX_NODES) { target = dlm_pick_migration_target(dlm, res); } - mlog(0, "node %u chosen for migration\n", target); + mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, + namelen, name, target); if (target >= O2NM_MAX_NODES || !test_bit(target, dlm->domain_map)) { @@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - int numlocks; + int numlocks, hasrefs; spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { @@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) } /* No need to migrate a lockres having no locks */ - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); - if (ret >= 0 && numlocks == 0) { + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret >= 0 && numlocks == 0 && !hasrefs) { spin_unlock(&res->spinlock); goto leave; } @@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, } queue++; } + + nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (nodenum < O2NM_MAX_NODES) { + spin_unlock(&res->spinlock); + return nodenum; + } spin_unlock(&res->spinlock); mlog(0, "have not found a suitable target yet! checking domain map\n"); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 77b4c04a2809..f6cba566429d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2241,11 +2241,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, mutex_lock(&inode->i_mutex); + ocfs2_iocb_clear_sem_locked(iocb); + relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (direct_io) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); } /* @@ -2382,8 +2386,10 @@ out: ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } mutex_unlock(&inode->i_mutex); @@ -2527,6 +2533,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } + ocfs2_iocb_clear_sem_locked(iocb); + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. @@ -2534,6 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, if (filp->f_flags & O_DIRECT) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { @@ -2575,8 +2584,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } bail: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c2e4f8222e2f..bf2e7764920e 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -350,7 +350,7 @@ enum { #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; -#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d2af0a8381a6..77a59891734e 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -297,6 +297,7 @@ xfs_rename( * it and some incremental backup programs won't work without it. */ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* * Adjust the link count on src_dp. This is necessary when |