diff options
Diffstat (limited to 'fs')
123 files changed, 2066 insertions, 1220 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3abc447783aa..7bf835f85bc8 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if ((fl->fl_flags & FL_POSIX) != FL_POSIX) BUG(); - res = posix_lock_file_wait(filp, fl); + res = locks_lock_file_wait(filp, fl); if (res < 0) goto out; @@ -231,7 +231,8 @@ out_unlock: if (res < 0 && fl->fl_type != F_UNLCK) { fl_type = fl->fl_type; fl->fl_type = F_UNLCK; - res = posix_lock_file_wait(filp, fl); + /* Even if this fails we want to return the remote error */ + locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } out: diff --git a/fs/block_dev.c b/fs/block_dev.c index 073bb57adab1..0a793c7930eb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1075,7 +1075,7 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - + blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ecbc63d3143e..9a2ec79e8cfb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1828,7 +1828,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, int found = 0; struct extent_buffer *eb; struct btrfs_inode_extref *extref; - struct extent_buffer *leaf; u32 item_size; u32 cur_offset; unsigned long ptr; @@ -1856,9 +1855,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, slot); - ptr = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; while (cur_offset < item_size) { @@ -1872,7 +1870,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, if (ret) break; - cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } btrfs_tree_read_unlock_blocking(eb); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 81220b2203c6..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,8 +44,6 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 -/* DIO is ready to submit */ -#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0d98aee34fee..1e60d00d4ea7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2847,6 +2847,8 @@ int open_ctree(struct super_block *sb, !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + if (!IS_ERR(chunk_root->node)) + free_extent_buffer(chunk_root->node); chunk_root->node = NULL; goto fail_tree_roots; } @@ -2885,6 +2887,8 @@ retry_root_backup: !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); + if (!IS_ERR(tree_root->node)) + free_extent_buffer(tree_root->node); tree_root->node = NULL; goto recovery_tree_root; } @@ -3765,9 +3769,7 @@ void close_ctree(struct btrfs_root *root) * block groups queued for removal, the deletion will be * skipped when we quit the cleaner thread. */ - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_delete_unused_bgs(root->fs_info); - mutex_unlock(&root->fs_info->cleaner_mutex); ret = btrfs_commit_super(root); if (ret) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 8d052209f473..2513a7f53334 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -112,11 +112,11 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, u32 generation; if (fh_type == FILEID_BTRFS_WITH_PARENT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE) return NULL; root_objectid = fid->root_objectid; } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) return NULL; root_objectid = fid->parent_root_objectid; } else @@ -136,11 +136,11 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, u32 generation; if ((fh_type != FILEID_BTRFS_WITH_PARENT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + fh_len < BTRFS_FID_SIZE_CONNECTABLE) && (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) && (fh_type != FILEID_BTRFS_WITHOUT_PARENT || - fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE)) return NULL; objectid = fid->objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5411f0ab5683..601d7d45d164 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2828,6 +2828,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2844,6 +2845,7 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif + trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); @@ -2893,6 +2895,7 @@ again: } out: assert_qgroups_uptodate(trans); + trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3742,10 +3745,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - if (total_bytes > 0) - found->full = 0; - else - found->full = 1; + found->full = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -4309,7 +4309,8 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + if (trans->can_flush_pending_bgs && + trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -8668,7 +8669,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { - btrfs_drop_and_free_fs_root(tree_root->fs_info, root); + btrfs_add_dropped_root(trans, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); @@ -9563,7 +9564,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + trans->can_flush_pending_bgs = false; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { if (ret) goto next; @@ -9584,6 +9587,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, next: list_del_init(&block_group->bg_list); } + trans->can_flush_pending_bgs = can_flush_pending_bgs; } int btrfs_make_block_group(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1018cfbfefa..3915c9473e94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2798,7 +2798,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_end_io_t end_io_func, int mirror_num, unsigned long prev_bio_flags, - unsigned long bio_flags) + unsigned long bio_flags, + bool force_bio_submit) { int ret = 0; struct bio *bio; @@ -2814,6 +2815,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || + force_bio_submit || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, @@ -2910,7 +2912,8 @@ static int __do_readpage(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int rw, + u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2958,6 +2961,7 @@ static int __do_readpage(struct extent_io_tree *tree, } while (cur <= end) { unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + bool force_bio_submit = false; if (cur >= last_byte) { char *userpage; @@ -3008,6 +3012,49 @@ static int __do_readpage(struct extent_io_tree *tree, block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; + + /* + * If we have a file range that points to a compressed extent + * and it's followed by a consecutive file range that points to + * to the same compressed extent (possibly with a different + * offset and/or length, so it either points to the whole extent + * or only part of it), we must make sure we do not submit a + * single bio to populate the pages for the 2 ranges because + * this makes the compressed extent read zero out the pages + * belonging to the 2nd range. Imagine the following scenario: + * + * File layout + * [0 - 8K] [8K - 24K] + * | | + * | | + * points to extent X, points to extent X, + * offset 4K, length of 8K offset 0, length 16K + * + * [extent X, compressed length = 4K uncompressed length = 16K] + * + * If the bio to read the compressed extent covers both ranges, + * it will decompress extent X into the pages belonging to the + * first range and then it will stop, zeroing out the remaining + * pages that belong to the other range that points to extent X. + * So here we make sure we submit 2 bios, one for the first + * range and another one for the third range. Both will target + * the same physical extent from disk, but we can't currently + * make the compressed bio endio callback populate the pages + * for both ranges because each compressed bio is tightly + * coupled with a single extent map, and each range can have + * an extent map with a different offset value relative to the + * uncompressed data of our extent and different lengths. This + * is a corner case so we prioritize correctness over + * non-optimal behavior (submitting 2 bios for the same extent). + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + prev_em_start && *prev_em_start != (u64)-1 && + *prev_em_start != em->orig_start) + force_bio_submit = true; + + if (prev_em_start) + *prev_em_start = em->orig_start; + free_extent_map(em); em = NULL; @@ -3057,7 +3104,8 @@ static int __do_readpage(struct extent_io_tree *tree, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, - this_bio_flag); + this_bio_flag, + force_bio_submit); if (!ret) { nr++; *bio_flags = this_bio_flag; @@ -3084,7 +3132,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int rw, + u64 *prev_em_start) { struct inode *inode; struct btrfs_ordered_extent *ordered; @@ -3104,7 +3153,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw); + mirror_num, bio_flags, rw, prev_em_start); page_cache_release(pages[index]); } } @@ -3114,7 +3163,8 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int rw, + u64 *prev_em_start) { u64 start = 0; u64 end = 0; @@ -3135,7 +3185,7 @@ static void __extent_readpages(struct extent_io_tree *tree, index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw); + rw, prev_em_start); start = page_start; end = start + PAGE_CACHE_SIZE - 1; first_index = index; @@ -3146,7 +3196,8 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw); + mirror_num, bio_flags, rw, + prev_em_start); } static int __extent_read_full_page(struct extent_io_tree *tree, @@ -3172,7 +3223,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw); + bio_flags, rw, NULL); return ret; } @@ -3198,7 +3249,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, int ret; ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, READ, NULL); if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; @@ -3451,7 +3502,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, - 0, 0, 0); + 0, 0, 0, false); if (ret) SetPageError(page); } @@ -3754,7 +3805,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags); + 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { set_btree_ioerr(p); @@ -4158,6 +4209,7 @@ int extent_readpages(struct extent_io_tree *tree, struct page *page; struct extent_map *em_cached = NULL; int nr = 0; + u64 prev_em_start = (u64)-1; for (page_idx = 0; page_idx < nr_pages; page_idx++) { page = list_entry(pages->prev, struct page, lru); @@ -4174,12 +4226,12 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ); + &bio, 0, &bio_flags, READ, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ); + &bio, 0, &bio_flags, READ, &prev_em_start); if (em_cached) free_extent_map(em_cached); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b823fac91c92..8c6f247ba81d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2584,7 +2584,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start); if (ret) goto out; - } else { + } else if (offset + len > inode->i_size) { /* * If we are fallocating from the end of the file onward we * need to zero out the end of the page if i_size lands in the diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0fa7253a2d7..611b66d73e80 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5084,7 +5084,8 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (!special_file(inode->i_mode)) + btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(inode, 0, (u64)-1); @@ -7408,6 +7409,10 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; +}; static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -7415,10 +7420,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = NULL; u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7436,7 +7441,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * that anything that needs to check if there's a transction doesn't get * confused. */ - outstanding_extents = current->journal_info; + dio_data = current->journal_info; current->journal_info = NULL; } @@ -7568,17 +7573,18 @@ unlock: * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (*outstanding_extents) { - (*outstanding_extents)--; + if (dio_data->outstanding_extents) { + (dio_data->outstanding_extents)--; } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); - set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); + WARN_ON(dio_data->reserve < len); + dio_data->reserve -= len; + current->journal_info = dio_data; } /* @@ -7601,8 +7607,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); - if (outstanding_extents) - current->journal_info = outstanding_extents; + if (dio_data) + current->journal_info = dio_data; return ret; } @@ -8329,7 +8335,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - u64 outstanding_extents = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_data dio_data = { 0 }; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8367,7 +8374,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; - outstanding_extents = div64_u64(count + + dio_data.outstanding_extents = div64_u64(count + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); @@ -8376,7 +8383,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * do the accounting properly if we go over the number we * originally calculated. Abuse current->journal_info for this. */ - current->journal_info = &outstanding_extents; + dio_data.reserve = round_up(count, root->sectorsize); + current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8391,16 +8399,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { - /* - * If the error comes from submitting stage, - * btrfs_get_blocsk_direct() has free'd data space, - * and metadata space will be handled by - * finish_ordered_fn, don't do that again to make - * sure bytes_may_use is correct. - */ - if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, - &BTRFS_I(inode)->runtime_flags)) - btrfs_delalloc_release_space(inode, count); + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0adf5422fce9..8d20f3b1cab0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4639,6 +4639,11 @@ locked: bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } + if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_bctl; + } + do_balance: /* * Ownership of bctl and mutually_exclusive_operation_running @@ -4650,12 +4655,15 @@ do_balance: need_unlock = false; ret = btrfs_balance(bctl, bargs); + bctl = NULL; if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } +out_bctl: + kfree(bctl); out_bargs: kfree(bargs); out_unlock: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa72bfd28f7d..a739b825bdd3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1920,10 +1920,12 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* * We know that it is or will be overwritten. Check this now. * The current inode being processed might have been the one that caused - * inode 'ino' to be orphanized, therefore ow_inode can actually be the - * same as sctx->send_progress. + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. */ - if (ow_inode <= sctx->send_progress) + if ((ow_inode < sctx->send_progress) || + (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && + gen == sctx->cur_inode_gen)) ret = 1; else ret = 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2b07b3581781..11d1eab9234d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1658,9 +1658,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * groups on disk until we're mounted read-write again * unless we clean them up here. */ - mutex_lock(&root->fs_info->cleaner_mutex); btrfs_delete_unused_bgs(fs_info); - mutex_unlock(&root->fs_info->cleaner_mutex); btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8f259b3a66b3..a5b06442f0bf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -117,6 +117,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); } + + /* We can free old roots now. */ + spin_lock(&trans->dropped_roots_lock); + while (!list_empty(&trans->dropped_roots)) { + root = list_first_entry(&trans->dropped_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&trans->dropped_roots_lock); + btrfs_drop_and_free_fs_root(fs_info, root); + spin_lock(&trans->dropped_roots_lock); + } + spin_unlock(&trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } @@ -255,11 +267,13 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); + INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->deleted_bgs_lock); + spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -336,6 +350,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, } +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + /* Add ourselves to the transaction dropped list */ + spin_lock(&cur_trans->dropped_roots_lock); + list_add_tail(&root->root_list, &cur_trans->dropped_roots); + spin_unlock(&cur_trans->dropped_roots_lock); + + /* Make sure we don't try to update the root at commit time */ + spin_lock(&root->fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +} + int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -525,6 +557,7 @@ again: h->delayed_ref_elem.seq = 0; h->type = type; h->allocating_chunk = false; + h->can_flush_pending_bgs = true; h->reloc_reserved = false; h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index edc2fbc262d7..a994bb097ee5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -65,6 +65,7 @@ struct btrfs_transaction { struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; + struct list_head dropped_roots; u64 num_dirty_bgs; /* @@ -76,6 +77,7 @@ struct btrfs_transaction { spinlock_t dirty_bgs_lock; struct list_head deleted_bgs; spinlock_t deleted_bgs_lock; + spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; int dirty_bg_run; @@ -116,6 +118,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; + bool can_flush_pending_bgs; bool reloc_reserved; bool sync; unsigned int type; @@ -216,5 +219,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); - +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); #endif diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2ca784a14e84..595279a8b99f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -376,6 +376,14 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_MASK \ + (BTRFS_BALANCE_ARGS_PROFILES | \ + BTRFS_BALANCE_ARGS_USAGE | \ + BTRFS_BALANCE_ARGS_DEVID | \ + BTRFS_BALANCE_ARGS_DRANGE | \ + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT) + /* * Profile changing flags. When SOFT is set we won't relocate chunk if * it already has the target profile (even though it may be diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 6706bde9ad1b..a2cb0c254060 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, lock_cmd, wait, fl); if (!err) { - err = flock_lock_file_wait(file, fl); + err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on flock_lock_file_wait, undid lock", err); + dout("got %d on locks_lock_file_wait, undid lock", err); } } return err; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index f4cf200b3c76..6908080e9b6d 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) goto error; /* attach the data */ - key->payload.data = payload; + key->payload.data[0] = payload; ret = 0; error: @@ -52,7 +52,7 @@ error: static void cifs_spnego_key_destroy(struct key *key) { - kfree(key->payload.data); + kfree(key->payload.data[0]); } @@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { - struct cifs_spnego_msg *msg = spnego_key->payload.data; + struct cifs_spnego_msg *msg = spnego_key->payload.data[0]; cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U, msg->secblob_len + msg->sesskey_len)); } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1ea780bc6376..3f93125916bf 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) * dereference payload.data! */ if (prep->datalen <= sizeof(key->payload)) { - key->payload.value = 0; - memcpy(&key->payload.value, prep->data, prep->datalen); - key->datalen = prep->datalen; - return 0; + key->payload.data[0] = NULL; + memcpy(&key->payload, prep->data, prep->datalen); + } else { + payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + key->payload.data[0] = payload; } - payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); - if (!payload) - return -ENOMEM; - key->payload.data = payload; key->datalen = prep->datalen; return 0; } @@ -76,7 +75,7 @@ static inline void cifs_idmap_key_destroy(struct key *key) { if (key->datalen > sizeof(key->payload)) - kfree(key->payload.data); + kfree(key->payload.data[0]); } static struct key_type cifs_idmap_key_type = { @@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) * it could be. */ ksid = sidkey->datalen <= sizeof(sidkey->payload) ? - (struct cifs_sid *)&sidkey->payload.value : - (struct cifs_sid *)sidkey->payload.data; + (struct cifs_sid *)&sidkey->payload : + (struct cifs_sid *)sidkey->payload.data[0]; ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { @@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; - memcpy(&id, &sidkey->payload.value, sizeof(uid_t)); + memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t)); uid = make_kuid(&init_user_ns, id); if (uid_valid(uid)) fuid = uid; } else { kgid_t gid; gid_t id; - memcpy(&id, &sidkey->payload.value, sizeof(gid_t)); + memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t)); gid = make_kgid(&init_user_ns, id); if (gid_valid(gid)) fgid = gid; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index aa0dc2573374..afa09fce8151 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -444,6 +444,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) return 0; } +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find the server given timestamp + * as part of ntlmv2 authentication (or local current time as + * default in case of failure) + */ +static __le64 +find_timestamp(struct cifs_ses *ses) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_TIMESTAMP) { + if (attrsize == sizeof(u64)) + return *((__le64 *)blobptr); + } + blobptr += attrsize; /* advance attr value */ + } + + return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); +} + static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, const struct nls_table *nls_cp) { @@ -641,6 +683,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ + __le64 rsp_timestamp; if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { @@ -659,6 +702,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } } + /* Must be within 5 minutes of the server (or in range +/-2h + * in case of Mac OS X), so simply carry over server timestamp + * (as Windows 7 does) + */ + rsp_timestamp = find_timestamp(ses); + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); tilen = ses->auth_key.len; tiblob = ses->auth_key.response; @@ -675,8 +724,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); ntlmv2->blob_signature = cpu_to_le32(0x00000101); ntlmv2->reserved = 0; - /* Must be within 5 minutes of the server */ - ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + ntlmv2->time = rsp_timestamp; + get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal)); ntlmv2->reserved2 = 0; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 27aea110e923..c3cc1609025f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.07" +#define CIFS_VERSION "2.08" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 773f4dc77630..3f2228570d44 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2325,13 +2325,14 @@ static int cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) { int rc = 0; - char *desc, *delim, *payload; + const char *delim, *payload; + char *desc; ssize_t len; struct key *key; struct TCP_Server_Info *server = ses->server; struct sockaddr_in *sa; struct sockaddr_in6 *sa6; - struct user_key_payload *upayload; + const struct user_key_payload *upayload; desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL); if (!desc) @@ -2374,14 +2375,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } down_read(&key->sem); - upayload = key->payload.data; + upayload = user_key_payload(key); if (IS_ERR_OR_NULL(upayload)) { rc = upayload ? PTR_ERR(upayload) : -EINVAL; goto out_key_put; } /* find first : in payload */ - payload = (char *)upayload->data; + payload = upayload->data; delim = strnchr(payload, upayload->datalen, ':'); cifs_dbg(FYI, "payload=%s\n", payload); if (!delim) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e2a6af1508af..47c5c97e2dd3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, out: if (flock->fl_flags & FL_POSIX && !rc) - rc = posix_lock_file_wait(file, flock); + rc = locks_lock_file_wait(file, flock); return rc; } @@ -3380,6 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; + gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); INIT_LIST_HEAD(tmplist); @@ -3392,7 +3393,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, */ __set_page_locked(page); rc = add_to_page_cache_locked(page, mapping, - page->index, GFP_KERNEL); + page->index, gfp); /* give up if we can't stick it in the cache */ if (rc) { @@ -3418,8 +3419,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, break; __set_page_locked(page); - if (add_to_page_cache_locked(page, mapping, page->index, - GFP_KERNEL)) { + if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { __clear_page_locked(page); break; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f621b44cb800..6b66dd5d1540 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2034,7 +2034,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, struct tcon_link *tlink = NULL; struct cifs_tcon *tcon = NULL; struct TCP_Server_Info *server; - struct cifs_io_parms io_parms; /* * To avoid spurious oplock breaks from server, in the case of @@ -2056,18 +2055,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, rc = -ENOSYS; cifsFileInfo_put(open_file); cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - unsigned int bytes_written; - - io_parms.netfid = open_file->fid.netfid; - io_parms.pid = open_file->pid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = attrs->ia_size; - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, - NULL, NULL, 1); - cifs_dbg(FYI, "Wrt seteof rc %d\n", rc); - } } else rc = -EINVAL; @@ -2093,28 +2080,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, else rc = -ENOSYS; cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc); - if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { - __u16 netfid; - int oplock = 0; - rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN, - GENERIC_WRITE, CREATE_NOT_DIR, &netfid, - &oplock, NULL, cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc == 0) { - unsigned int bytes_written; - - io_parms.netfid = netfid; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = attrs->ia_size; - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL, - NULL, 1); - cifs_dbg(FYI, "wrt seteof rc %d\n", rc); - CIFSSMBClose(xid, tcon, netfid); - } - } if (tlink) cifs_put_tlink(tlink); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index bce6fdcd5d48..59727e32ed0f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data) goto out; } - msg = spnego_key->payload.data; + msg = spnego_key->payload.data[0]; /* * check version field to make sure that cifs.upcall is * sending us a response in an expected form diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index df91bcf56d67..18da19f4f811 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -50,9 +50,13 @@ change_conf(struct TCP_Server_Info *server) break; default: server->echoes = true; - server->oplocks = true; + if (enable_oplocks) { + server->oplocks = true; + server->oplock_credits = 1; + } else + server->oplocks = false; + server->echo_credits = 1; - server->oplock_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; return 0; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 070fb2ad85ce..61276929d139 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -46,6 +46,7 @@ #include "smb2status.h" #include "smb2glob.h" #include "cifspdu.h" +#include "cifs_spnego.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -486,19 +487,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "missing security blob on negprot\n"); rc = cifs_enable_signing(server, ses->sign); -#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */ if (rc) goto neg_exit; - if (blob_length) + if (blob_length) { rc = decode_negTokenInit(security_blob, blob_length, server); - if (rc == 1) - rc = 0; - else if (rc == 0) { - rc = -EIO; - goto neg_exit; + if (rc == 1) + rc = 0; + else if (rc == 0) + rc = -EIO; } -#endif - neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -592,7 +589,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; - char *security_blob; + struct key *spnego_key = NULL; + char *security_blob = NULL; char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ @@ -620,7 +618,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, ses->ntlmssp->sesskey_per_smbsess = true; /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ - ses->sectype = RawNTLMSSP; + if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) + ses->sectype = RawNTLMSSP; ssetup_ntlmssp_authenticate: if (phase == NtLmChallenge) @@ -649,7 +648,48 @@ ssetup_ntlmssp_authenticate: iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; - if (phase == NtLmNegotiate) { + + if (ses->sectype == Kerberos) { +#ifdef CONFIG_CIFS_UPCALL + struct cifs_spnego_msg *msg; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto ssetup_exit; + } + + msg = spnego_key->payload.data[0]; + /* + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form + */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "bad cifs.upcall version. Expected %d got %d", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto ssetup_exit; + } + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto ssetup_exit; + } + ses->auth_key.len = msg->sesskey_len; + blob_length = msg->secblob_len; + iov[1].iov_base = msg->data + msg->sesskey_len; + iov[1].iov_len = blob_length; +#else + rc = -EOPNOTSUPP; + goto ssetup_exit; +#endif /* CONFIG_CIFS_UPCALL */ + } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */ ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), GFP_KERNEL); if (ntlmssp_blob == NULL) { @@ -672,6 +712,8 @@ ssetup_ntlmssp_authenticate: /* with raw NTLMSSP we don't encapsulate in SPNEGO */ security_blob = ntlmssp_blob; } + iov[1].iov_base = security_blob; + iov[1].iov_len = blob_length; } else if (phase == NtLmAuthenticate) { req->hdr.SessionId = ses->Suid; ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, @@ -699,6 +741,8 @@ ssetup_ntlmssp_authenticate: } else { security_blob = ntlmssp_blob; } + iov[1].iov_base = security_blob; + iov[1].iov_len = blob_length; } else { cifs_dbg(VFS, "illegal ntlmssp phase\n"); rc = -EIO; @@ -710,8 +754,6 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */ - 4 /* rfc1001 len */); req->SecurityBufferLength = cpu_to_le16(blob_length); - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; inc_rfc1001_len(req, blob_length - 1 /* pad */); @@ -722,6 +764,7 @@ ssetup_ntlmssp_authenticate: kfree(security_blob); rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; + ses->Suid = rsp->hdr.SessionId; if (resp_buftype != CIFS_NO_BUFFER && rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { if (phase != NtLmNegotiate) { @@ -739,7 +782,6 @@ ssetup_ntlmssp_authenticate: /* NTLMSSP Negotiate sent now processing challenge (response) */ phase = NtLmChallenge; /* process ntlmssp challenge */ rc = 0; /* MORE_PROCESSING is not an error here but expected */ - ses->Suid = rsp->hdr.SessionId; rc = decode_ntlmssp_challenge(rsp->Buffer, le16_to_cpu(rsp->SecurityBufferLength), ses); } @@ -796,6 +838,10 @@ keygen_exit: kfree(ses->auth_key.response); ses->auth_key.response = NULL; } + if (spnego_key) { + key_invalidate(spnego_key); + key_put(spnego_key); + } kfree(ses->ntlmssp); return rc; @@ -876,6 +922,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (tcon && tcon->bad_network_name) return -ENOENT; + if ((tcon && tcon->seal) && + ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) { + cifs_dbg(VFS, "encryption requested but no server support"); + return -EOPNOTSUPP; + } + unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL); if (unc_path == NULL) return -ENOMEM; @@ -955,6 +1007,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); init_copy_chunk_defaults(tcon); + if (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA) + cifs_dbg(VFS, "Encrypted shares not supported"); if (tcon->ses->server->ops->validate_negotiate) rc = tcon->ses->server->ops->validate_negotiate(xid, tcon); tcon_exit: @@ -285,6 +285,7 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { + struct address_space *mapping = inode->i_mapping; sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; void __pmem *addr; @@ -292,6 +293,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, pgoff_t size; int error; + i_mmap_lock_read(mapping); + /* * Check truncate didn't happen while we were allocating a block. * If it did, this block may or may not be still allocated to the @@ -321,6 +324,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, error = vm_insert_mixed(vma, vaddr, pfn); out: + i_mmap_unlock_read(mapping); + return error; } @@ -382,17 +387,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * from a read fault and we've raced with a truncate */ error = -EIO; - goto unlock; + goto unlock_page; } - } else { - i_mmap_lock_write(mapping); } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock; + goto unlock_page; if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { @@ -403,9 +406,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; if (error) - goto unlock; + goto unlock_page; } else { - i_mmap_unlock_write(mapping); return dax_load_hole(mapping, page, vmf); } } @@ -417,15 +419,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock; + goto unlock_page; vmf->page = page; if (!page) { + i_mmap_lock_read(mapping); /* Check we didn't race with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); error = -EIO; - goto unlock; + goto out; } } return VM_FAULT_LOCKED; @@ -461,8 +465,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); } - if (!page) - i_mmap_unlock_write(mapping); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -471,14 +473,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - unlock: + unlock_page: if (page) { unlock_page(page); page_cache_release(page); - } else { - i_mmap_unlock_write(mapping); } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -556,10 +555,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - i_mmap_lock_write(mapping); length = get_block(inode, block, &bh, write); if (length) return VM_FAULT_SIGBUS; + i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -569,24 +568,14 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) goto fallback; - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - int i; - for (i = 0; i < PTRS_PER_PMD; i++) - clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } - /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ if (buffer_new(&bh)) { - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); } /* @@ -633,15 +622,25 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) goto fallback; + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + int i; + for (i = 0; i < PTRS_PER_PMD; i++) + clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + wmb_pmem(); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + result |= VM_FAULT_MAJOR; + } + result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); } out: + i_mmap_unlock_read(mapping); + if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - i_mmap_unlock_write(mapping); - return result; fallback: diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 6c55ade071c3..d2ba12e23ed9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = { .llseek = noop_llseek, }; +static struct dentry *debugfs_create_mode(const char *name, umode_t mode, + struct dentry *parent, void *value, + const struct file_operations *fops, + const struct file_operations *fops_ro, + const struct file_operations *fops_wo) +{ + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, fops_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, fops_wo); + + return debugfs_create_file(name, mode, parent, value, fops); +} + static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; @@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u8); + return debugfs_create_mode(name, mode, parent, value, &fops_u8, + &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u16); + return debugfs_create_mode(name, mode, parent, value, &fops_u16, + &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u32); + return debugfs_create_mode(name, mode, parent, value, &fops_u32, + &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -240,17 +238,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_u64); + return debugfs_create_mode(name, mode, parent, value, &fops_u64, + &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); +static int debugfs_ulong_set(void *data, u64 val) +{ + *(unsigned long *)data = val; + return 0; +} + +static int debugfs_ulong_get(void *data, u64 *val) +{ + *val = *(unsigned long *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); + +/** + * debugfs_create_ulong - create a debugfs file that is used to read and write + * an unsigned long value. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. It is not wise to check for this value, but rather, check for + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling + * code. + */ +struct dentry *debugfs_create_ulong(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value) +{ + return debugfs_create_mode(name, mode, parent, value, &fops_ulong, + &fops_ulong_ro, &fops_ulong_wo); +} +EXPORT_SYMBOL_GPL(debugfs_create_ulong); + DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); @@ -264,6 +304,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value @@ -286,14 +328,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x8); + return debugfs_create_mode(name, mode, parent, value, &fops_x8, + &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -310,14 +346,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x16); + return debugfs_create_mode(name, mode, parent, value, &fops_x16, + &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -334,14 +364,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_x32); + return debugfs_create_mode(name, mode, parent, value, &fops_x32, + &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -358,7 +382,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_file(name, mode, parent, value, &fops_x64); + return debugfs_create_mode(name, mode, parent, value, &fops_x64, + &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -375,6 +400,8 @@ static int debugfs_size_t_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value @@ -389,7 +416,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { - return debugfs_create_file(name, mode, parent, value, &fops_size_t); + return debugfs_create_mode(name, mode, parent, value, &fops_size_t, + &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -422,16 +450,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { - /* if there are no write bits set, make read only */ - if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_ro); - /* if there are no read bits set, make write only */ - if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, - &fops_atomic_t_wo); - - return debugfs_create_file(name, mode, parent, value, &fops_atomic_t); + return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t, + &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); @@ -439,7 +459,7 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; - u32 *val = file->private_data; + bool *val = file->private_data; if (*val) buf[0] = 'Y'; @@ -457,7 +477,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; - u32 *val = file->private_data; + bool *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -478,6 +498,18 @@ static const struct file_operations fops_bool = { .llseek = default_llseek, }; +static const struct file_operations fops_bool_ro = { + .read = debugfs_read_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_bool_wo = { + .write = debugfs_write_file_bool, + .open = simple_open, + .llseek = default_llseek, +}; + /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. @@ -503,9 +535,10 @@ static const struct file_operations fops_bool = { * code. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, - struct dentry *parent, u32 *value) + struct dentry *parent, bool *value) { - return debugfs_create_file(name, mode, parent, value, &fops_bool); + return debugfs_create_mode(name, mode, parent, value, &fops_bool, + &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index c711be8d6a3c..5d8f35f1382a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -533,7 +533,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) /** * debugfs_remove - removes a file or directory from the debugfs filesystem * @dentry: a pointer to a the dentry of the file or directory to be - * removed. + * removed. If this parameter is NULL or an error value, nothing + * will be done. * * This function removes a file or directory in debugfs that was previously * created with a call to another debugfs function (like @@ -565,7 +566,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_remove_recursive - recursively removes a directory - * @dentry: a pointer to a the dentry of the directory to be removed. + * @dentry: a pointer to a the dentry of the directory to be removed. If this + * parameter is NULL or an error value, nothing will be done. * * This function recursively removes a directory tree in debugfs that * was previously created with a call to another debugfs function diff --git a/fs/direct-io.c b/fs/direct-io.c index 11256291642e..3ae0e0427191 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -120,6 +120,7 @@ struct dio { int page_errors; /* errno from get_user_pages() */ int is_async; /* is IO async ? */ bool defer_completion; /* defer AIO completion to workqueue? */ + bool should_dirty; /* if pages should be dirtied */ int io_error; /* IO error in completion path */ unsigned long refcount; /* direct_io_worker() and bios */ struct bio *bio_list; /* singly linked via bi_private */ @@ -393,7 +394,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ) + if (dio->is_async && dio->rw == READ && dio->should_dirty) bio_set_pages_dirty(bio); if (sdio->submit_io) @@ -464,14 +465,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (bio->bi_error) dio->io_error = -EIO; - if (dio->is_async && dio->rw == READ) { + if (dio->is_async && dio->rw == READ && dio->should_dirty) { bio_check_pages_dirty(bio); /* transfers ownership */ err = bio->bi_error; } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (dio->rw == READ && !PageCompound(page)) + if (dio->rw == READ && !PageCompound(page) && + dio->should_dirty) set_page_dirty_lock(page); page_cache_release(page); } @@ -1219,6 +1221,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, spin_lock_init(&dio->bio_lock); dio->refcount = 1; + dio->should_dirty = (iter->type == ITER_IOVEC); sdio.iter = iter; sdio.final_block_in_request = (offset + iov_iter_count(iter)) >> blkbits; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 5532f097f6da..d401425f602a 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); if (xop->callback == NULL) { - rv = wait_event_killable(recv_wq, (op->done != 0)); + rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { log_debug(ls, "dlm_posix_lock: wait killed %llx", (unsigned long long)number); @@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, rv = op->info.rv; if (!rv) { - if (posix_lock_file_wait(file, fl) < 0) + if (locks_lock_file_wait(file, fl) < 0) log_error(ls, "dlm_posix_lock: vfs lock error %llx", (unsigned long long)number); } @@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, /* cause the vfs unlock to return ENOENT if lock is not found */ fl->fl_flags |= FL_EXISTS; - rv = posix_lock_file_wait(file, fl); + rv = locks_lock_file_wait(file, fl); if (rv == -ENOENT) { rv = 0; goto out_free; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5ba029e627cc..7b39260c7bba 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key) { if (key->type == &key_type_encrypted) return (struct ecryptfs_auth_tok *) - (&((struct encrypted_key_payload *)key->payload.data)->payload_data); + (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data); else return NULL; } @@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key) auth_tok = ecryptfs_get_encrypted_key_payload_data(key); if (!auth_tok) - return (struct ecryptfs_auth_tok *) - (((struct user_key_payload *)key->payload.data)->data); + return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; else return auth_tok; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 47728da7702c..b46e9fc64196 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -63,7 +63,7 @@ config EXT4_FS If unsure, say N. config EXT4_USE_FOR_EXT2 - bool "Use ext4 for ext2/ext3 file systems" + bool "Use ext4 for ext2 file systems" depends on EXT4_FS depends on EXT2_FS=n default y diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index f9270ec2a132..c5882b36e558 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -120,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode) struct key *keyring_key = NULL; struct ext4_encryption_key *master_key; struct ext4_encryption_context ctx; - struct user_key_payload *ukp; + const struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct crypto_ablkcipher *ctfm; const char *cipher_str; @@ -213,7 +213,7 @@ retry: res = -ENOKEY; goto out; } - ukp = ((struct user_key_payload *)keyring_key->payload.data); + ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; goto out; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a91e4bf6ebc4..d94af71a4e7f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -165,8 +165,8 @@ int ext4_mpage_readpages(struct address_space *mapping, if (pages) { page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) + if (add_to_page_cache_lru(page, mapping, page->index, + GFP_KERNEL & mapping_gfp_mask(mapping))) goto next_page; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c5a38e352a80..f661d80474be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -47,7 +47,8 @@ repeat: /* * We guarantee no failure on the returned page. */ -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; @@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) .blk_addr = index, .encrypted_page = NULL, }; + + if (unlikely(!is_meta)) + fio.rw &= ~REQ_META; repeat: page = grab_cache_page(mapping, index); if (!page) { @@ -91,6 +95,17 @@ out: return page; } +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +/* for POR only */ +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -125,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) { block_t prev_blk_addr = 0; struct page *page; @@ -133,10 +149,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + if (unlikely(type == META_POR)) + fio.rw &= ~REQ_META; + for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -196,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -257,7 +276,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { @@ -277,6 +296,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (prev == LONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -297,13 +323,14 @@ continue_unlock: break; } nwritten++; + prev = page->index; if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); cond_resched(); } - +stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); @@ -495,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); @@ -1000,6 +1027,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); + /* need to wait for end_io results */ + wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1109,6 +1141,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason == CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); + + /* do checkpoint periodically */ + sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); out: mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 9f77de2ef317..5de2d866a25c 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -122,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode) struct key *keyring_key = NULL; struct f2fs_encryption_key *master_key; struct f2fs_encryption_context ctx; - struct user_key_payload *ukp; + const struct user_key_payload *ukp; struct crypto_ablkcipher *ctfm; const char *cipher_str; char raw_key[F2FS_MAX_KEY_SIZE]; @@ -199,7 +199,7 @@ retry: } crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); - ukp = ((struct user_key_payload *)keyring_key->payload.data); + ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { res = -EINVAL; goto out; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a82abe921b89..972eab7ac071 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -275,7 +275,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int rw, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -292,7 +293,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return read_mapping_page(mapping, index, NULL); - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); @@ -352,7 +353,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, false); if (IS_ERR(page)) return page; @@ -372,12 +373,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -411,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released @@ -439,7 +441,7 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, true); if (IS_ERR(page)) goto repeat; @@ -447,9 +449,9 @@ repeat: lock_page(page); } got_it: - if (new_i_size && - i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } @@ -489,8 +491,9 @@ alloc: /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); /* direct IO doesn't use extent cache to maximize the performance */ f2fs_drop_largest_extent(dn->inode, fofs); @@ -523,6 +526,9 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; + if (unlikely(f2fs_cp_error(sbi))) + goto sync_out; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { if (__allocate_data_block(&dn)) @@ -565,6 +571,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; @@ -595,40 +602,40 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) { - err = -ENOENT; - goto put_out; - } else if (flag == F2FS_GET_BLOCK_READ || - flag == F2FS_GET_BLOCK_DIO) { - goto put_out; + + if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto put_out; + } + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map->m_flags = F2FS_MAP_NEW; + } else { + if (flag != F2FS_GET_BLOCK_FIEMAP || + dn.data_blkaddr != NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; + goto put_out; + } + + /* + * preallocated unwritten block should be mapped + * for fiemap. + */ + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags = F2FS_MAP_UNWRITTEN; } - /* - * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), - * mark it as mapped and unwritten block. - */ } - if (dn.data_blkaddr != NULL_ADDR) { - map->m_flags = F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; - } else if (create) { - err = __allocate_data_block(&dn); - if (err) - goto put_out; - allocated = true; - map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - } else { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; - } + map->m_flags |= F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + map->m_len = 1; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - map->m_len = 1; dn.ofs_in_node++; pgofs++; @@ -647,23 +654,35 @@ get_next: goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && - flag != F2FS_GET_BLOCK_FIEMAP) - goto put_out; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } if (maxblocks > map->m_len) { block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR && create) { - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; + } } + /* Give more consecutive addresses for the readahead */ if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || @@ -752,6 +771,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + if (f2fs_has_inline_data(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + return ret; + } + mutex_lock(&inode->i_mutex); if (len >= isize) { @@ -903,7 +928,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, false)) + if (f2fs_map_blocks(inode, &map, 0, + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -936,21 +962,14 @@ submit_and_realloc: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - struct page *cpage; ctx = f2fs_get_crypto_ctx(inode); if (IS_ERR(ctx)) goto set_error_page; /* wait the page to be moved by cleaning */ - cpage = find_lock_page( - META_MAPPING(F2FS_I_SB(inode)), - block_nr); - if (cpage) { - f2fs_wait_on_page_writeback(cpage, - DATA); - f2fs_put_page(cpage, 1); - } + f2fs_wait_on_encrypted_page_writeback( + F2FS_I_SB(inode), block_nr); } bio = bio_alloc(GFP_KERNEL, @@ -1012,6 +1031,9 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; + struct page *page = list_entry(pages->prev, struct page, lru); + + trace_f2fs_readpages(inode, page, nr_pages); /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) @@ -1041,6 +1063,11 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), + fio->blk_addr); + fio->encrypted_page = f2fs_encrypt(inode, fio->page); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); @@ -1429,6 +1456,10 @@ put_next: f2fs_wait_on_page_writeback(page, DATA); + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (len == PAGE_CACHE_SIZE) goto out_update; if (PageUptodate(page)) @@ -1551,10 +1582,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + err = -EIO; + goto out; + } + } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); +out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); @@ -1636,12 +1673,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + if (f2fs_has_inline_data(inode)) + return 0; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + return generic_block_bmap(mapping, block, get_data_block_bmap); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d013d8479753..478e5d54154f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,11 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_largest = atomic_read(&sbi->read_hit_largest); - si->hit_cached = atomic_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_cached = atomic64_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic_read(&sbi->total_hit_ext); + si->total_ext = atomic64_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div_u64(bimodal, dist); + si->bimodal = div64_u64(bimodal, dist); if (si->dirty_count) si->avg_vblocks = div_u64(total_vblocks, ndirty); else @@ -198,9 +198,9 @@ get_cache: si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -283,12 +283,12 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); seq_puts(s, "\nExtent Cache:\n"); - seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, si->hit_rbtree); - seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", !si->total_ext ? 0 : - (si->hit_total * 100) / si->total_ext, + div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", si->ext_tree, si->ext_node); @@ -333,13 +333,13 @@ static int stat_show(struct seq_file *s, void *v) /* memory footprint */ update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB\n", + seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); - seq_printf(s, " - static: %u KB\n", + seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %u KB\n", + seq_printf(s, " - cached: %llu KB\n", si->cache_mem >> 10); - seq_printf(s, " - paged : %u KB\n", + seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } mutex_unlock(&f2fs_stat_mutex); @@ -378,10 +378,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic_set(&sbi->total_hit_ext, 0); - atomic_set(&sbi->read_hit_rbtree, 0); - atomic_set(&sbi->read_hit_largest, 0); - atomic_set(&sbi->read_hit_cached, 0); + atomic64_set(&sbi->total_hit_ext, 0); + atomic64_set(&sbi->read_hit_rbtree, 0); + atomic64_set(&sbi->read_hit_largest, 0); + atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8f15fc134040..7c1678ba8f92 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -258,7 +258,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) if (f2fs_has_inline_dentry(dir)) return f2fs_parent_inline_dir(dir, p); - page = get_lock_data_page(dir, 0); + page = get_lock_data_page(dir, 0, false); if (IS_ERR(page)) return NULL; @@ -740,7 +740,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx); + dentry_page = get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -787,7 +787,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, else d_type = DT_UNKNOWN; - /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -795,12 +794,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; + de_name.name = kmalloc(de_name.len, GFP_NOFS); + if (!de_name.name) + return false; + + memcpy(de_name.name, d->filename[bit_pos], de_name.len); + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, &de_name, fstr); - de_name = *fstr; - fstr->len = save_len; + kfree(de_name.name); if (ret < 0) return true; + + de_name = *fstr; + fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, @@ -847,7 +854,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { - dentry_page = get_lock_data_page(inode, n); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 997ac86f2a1d..7ddba812e11b 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -155,11 +155,12 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - et->count; } -static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +static void __drop_largest_extent(struct inode *inode, + pgoff_t fofs, unsigned int len) { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) largest->len = 0; } @@ -168,7 +169,7 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) if (!f2fs_may_extent_tree(inode)) return; - __drop_largest_extent(inode, fofs); + __drop_largest_extent(inode, fofs, 1); } void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) @@ -350,8 +351,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (en) { - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; } return en; @@ -388,18 +388,17 @@ do_insert: if (!en) return NULL; - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; return en; } -unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static unsigned int f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; @@ -409,6 +408,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!et) return false; + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + write_lock(&et->lock); if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { @@ -419,148 +420,99 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, prev = et->largest; dei.len = 0; - /* we do not guarantee that the largest extent is cached all the time */ - __drop_largest_extent(inode, fofs); + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, &insert_p, &insert_parent); - if (!en) { - if (next_en) { - en = next_en; - f2fs_bug_on(sbi, en->ei.fofs <= pos); - pos = en->ei.fofs; - } else { - /* - * skip searching in the tree since there is no - * larger extent node in the cache. - */ - goto update_extent; - } - } + if (!en) + en = next_en; /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ - while (en) { - struct rb_node *node; + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ - if (pos >= end) - break; + next_en = en1 = NULL; dei = en->ei; - en1 = en2 = NULL; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, pos >= org_end); - node = rb_next(&en->rb_node); + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + en->ei.len = pos - en->ei.fofs; + prev_en = en; + parts = 1; + } - /* - * 2.1 there are four cases when we invalidate blkaddr in extent - * node, |V: valid address, X: will be invalidated| - */ - /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ - if (pos > dei.fofs && end >= dei.fofs + dei.len) { - en->ei.len = pos - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; + if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (parts) { + set_extent_info(&ei, end, + end - dei.fofs + dei.blk, + org_end - end); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + next_en = en1; + } else { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + next_en = en; } - - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + parts++; } - /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ - if (pos <= dei.fofs && end < dei.fofs + dei.len) { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; - } + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + next_en = node ? + rb_entry(node, struct extent_node, rb_node) + : NULL; } - __detach_extent_node(sbi, et, en); + if (parts) + __try_update_largest_extent(et, en); + else + __detach_extent_node(sbi, et, en); /* - * if we remove node in rb-tree, our parent node pointer may - * point the wrong place, discard them. + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. */ - insert_p = NULL; - insert_parent = NULL; - - /* case#3, invalidate entire extent node |XXXXXXXXXX| */ - if (pos <= dei.fofs && end >= dei.fofs + dei.len) { - if (__is_extent_same(&dei, &et->largest)) - et->largest.len = 0; - goto update; + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; } - /* - * case#4, invalidate data in the middle of extent node - * |VVVXXXXVVV| - */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - unsigned int endofs; - - /* insert left part of split extent into cache */ - if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - pos - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len; - if (endofs - end >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - endofs - end); - en2 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - } -update: - /* 2.2 update in global extent list */ + /* update in global extent list */ spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) + if (!parts && !list_empty(&en->list)) list_del(&en->list); if (en1) list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); - /* 2.3 release extent node */ - if (en) + /* release extent node */ + if (!parts) kmem_cache_free(extent_node_slab, en); -next: - en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; - next_en = en; - if (en) - pos = en->ei.fofs; + + en = next_en; } -update_extent: /* 3. update extent in extent cache */ if (blkaddr) { struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en3 = __try_merge_extent_node(sbi, et, &ei, &den, + en1 = __try_merge_extent_node(sbi, et, &ei, &den, prev_en, next_en); - if (!en3) - en3 = __insert_extent_tree(sbi, et, &ei, + if (!en1) + en1 = __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ @@ -572,11 +524,11 @@ update_extent: } spin_lock(&sbi->extent_lock); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); + if (en1) { + if (list_empty(&en1->list)) + list_add_tail(&en1->list, &sbi->extent_list); else - list_move_tail(&en3->list, &sbi->extent_list); + list_move_tail(&en1->list, &sbi->extent_list); } if (den && !list_empty(&den->list)) list_del(&den->list); @@ -650,6 +602,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&sbi->extent_lock); + /* + * reset ino for searching victims from beginning of global extent tree. + */ + ino = F2FS_ROOT_INO(sbi); + while ((found = radix_tree_gang_lookup(root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -663,7 +620,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) write_unlock(&et->lock); if (node_cnt + tree_cnt >= nr_shrink) - break; + goto unlock_out; } } unlock_out: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1a90ffd7cad..9db5500d63d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include <linux/magic.h> #include <linux/kobject.h> #include <linux/sched.h> +#include <linux/vmalloc.h> #include <linux/bio.h> #ifdef CONFIG_F2FS_CHECK_FS @@ -52,6 +53,7 @@ #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -122,6 +124,7 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define DEF_CP_INTERVAL 60 /* 60 secs */ struct cp_control { int reason; @@ -230,6 +233,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -246,6 +250,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -492,12 +497,20 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len > et->largest.len) + et->largest = en->ei; +} + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -724,6 +737,7 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; + long cp_expires, cp_interval; /* next expected periodic cp */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -787,10 +801,10 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic_t total_hit_ext; /* # of lookup extent cache */ - atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic_t read_hit_largest; /* # of hit largest extent node */ - atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic64_t total_hit_ext; /* # of lookup extent cache */ + atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic64_t read_hit_largest; /* # of hit largest extent node */ + atomic64_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -1220,6 +1234,24 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + if (!for_write) + return grab_cache_page(mapping, index); + return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); +} + +static inline void f2fs_copy_page(struct page *src, struct page *dst) +{ + char *src_kaddr = kmap(src); + char *dst_kaddr = kmap(dst); + + memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); + kunmap(dst); + kunmap(src); +} + static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -1579,6 +1611,26 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(mode); } +static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1721,6 +1773,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); @@ -1739,6 +1792,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1754,8 +1808,9 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); @@ -1787,9 +1842,9 @@ void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); @@ -1802,7 +1857,7 @@ int f2fs_release_page(struct page *, gfp_t); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); -int f2fs_gc(struct f2fs_sb_info *); +int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* @@ -1820,7 +1875,8 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + unsigned long long hit_largest, hit_cached, hit_rbtree; + unsigned long long hit_total, total_ext; int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; @@ -1844,7 +1900,7 @@ struct f2fs_stat_info { unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; - unsigned base_mem, cache_mem, page_mem; + unsigned long long base_mem, cache_mem, page_mem; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) @@ -1857,10 +1913,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) -#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -1998,6 +2054,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); +int f2fs_inline_data_fiemap(struct inode *, + struct fiemap_extent_info *, __u64, __u64); /* * shrinker.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8120f8685141..a197215ad52b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + i_size_read(inode)) { unsigned offset; offset = i_size_read(inode) & ~PAGE_CACHE_MASK; zero_user_segment(page, offset, PAGE_CACHE_SIZE); @@ -86,6 +87,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + /* if gced page is attached, don't write to cold segment */ clear_cold_data(page); out: @@ -343,7 +349,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { @@ -504,14 +510,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); return 0; } - page = get_lock_data_page(inode, index); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) return 0; truncate_out: @@ -680,6 +686,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -738,23 +745,31 @@ static int fill_zero(struct inode *inode, pgoff_t index, int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { - pgoff_t index; int err; - for (index = pg_start; index < pg_end; index++) { + while (pg_start < pg_end) { struct dnode_of_data dn; + pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { - if (err == -ENOENT) + if (err == -ENOENT) { + pg_start++; continue; + } return err; } - if (dn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&dn, 1); + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); + + pg_start += count; } return 0; } @@ -765,9 +780,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - if (f2fs_has_inline_data(inode)) { ret = f2fs_convert_inline_inode(inode); if (ret) @@ -805,8 +817,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); - blk_start = pg_start << PAGE_CACHE_SHIFT; - blk_end = pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -819,86 +831,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int __exchange_data_block(struct inode *inode, pgoff_t src, + pgoff_t dst, bool full) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; - - for (; end < nrpages; start++, end++) { - block_t new_addr, old_addr; - - f2fs_lock_op(sbi); + block_t new_addr; + bool do_replace = false; + int ret; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + if (!is_checkpointed_data(sbi, new_addr)) { + dn.data_blkaddr = NULL_ADDR; + /* do not invalidate this block address */ + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + do_replace = true; } + f2fs_put_dnode(&dn); + } - if (new_addr == NULL_ADDR) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - f2fs_unlock_op(sbi); - continue; - } + if (new_addr == NULL_ADDR) + return full ? truncate_hole(inode, dst, dst + 1) : 0; - if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - continue; - } else { - truncate_data_blocks_range(&dn, 1); - } + if (do_replace) { + struct page *ipage = get_node_page(sbi, inode->i_ino); + struct node_info ni; - f2fs_put_dnode(&dn); - } else { - struct page *ipage; + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto err_out; + } - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, dst); + if (ret) + goto err_out; - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, start); - if (ret) - goto out; + truncate_data_blocks_range(&dn, 1); - old_addr = dn.data_blkaddr; - if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - invalidate_blocks(sbi, old_addr); + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true); + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(inode, src, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(inode, NULL, dst, false); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - } else if (new_addr != NEW_ADDR) { - struct node_info ni; + return truncate_hole(inode, src, src + 1); + } + return 0; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } +err_out: + if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + f2fs_put_dnode(&dn); + } + return ret; +} - f2fs_put_dnode(&dn); - } +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + int ret = 0; + + for (; end < nrpages; start++, end++) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); + if (ret) + break; } - return 0; -out: - f2fs_unlock_op(sbi); return ret; } @@ -908,9 +934,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (offset + len >= i_size_read(inode)) return -EINVAL; @@ -940,7 +963,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + new_size = i_size_read(inode) - len; + truncate_pagecache(inode, new_size); ret = truncate_blocks(inode, new_size, true); if (!ret) @@ -959,9 +987,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, loff_t off_start, off_end; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; @@ -1003,7 +1028,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, return ret; new_size = max_t(loff_t, new_size, - pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_CACHE_SHIFT); } for (index = pg_start; index < pg_end; index++) { @@ -1039,7 +1064,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); new_size = max_t(loff_t, new_size, - (index + 1) << PAGE_CACHE_SHIFT); + (loff_t)(index + 1) << PAGE_CACHE_SHIFT); } if (off_end) { @@ -1066,10 +1091,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t pg_start, pg_end, delta, nrpages, idx; loff_t new_size; - int ret; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; + int ret = 0; new_size = i_size_read(inode) + len; if (new_size > inode->i_sb->s_maxbytes) @@ -1107,57 +1129,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { - struct dnode_of_data dn; - struct page *ipage; - block_t new_addr, old_addr; - f2fs_lock_op(sbi); - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - goto next; - } else if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - goto next; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); - } - - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, idx + delta); - if (ret) - goto out; - - old_addr = dn.data_blkaddr; - f2fs_bug_on(sbi, old_addr != NEW_ADDR); - - if (new_addr != NEW_ADDR) { - struct node_info ni; - - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } - f2fs_put_dnode(&dn); -next: + ret = __exchange_data_block(inode, idx, idx + delta, false); f2fs_unlock_op(sbi); + if (ret) + break; } - i_size_write(inode, new_size); - return 0; -out: - f2fs_unlock_op(sbi); + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + if (!ret) + i_size_write(inode, new_size); return ret; } @@ -1204,9 +1188,10 @@ noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) - new_size = (index + 1) << PAGE_CACHE_SHIFT; + new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; else if (index == pg_end) - new_size = (index << PAGE_CACHE_SHIFT) + off_end; + new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + + off_end; else new_size += PAGE_CACHE_SIZE; } @@ -1228,6 +1213,10 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (f2fs_encrypted_inode(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -1437,8 +1426,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (!f2fs_is_first_block_written(inode)) return truncate_partial_data_page(inode, 0, true); - punch_hole(inode, 0, F2FS_BLKSIZE); - return 0; + return punch_hole(inode, 0, F2FS_BLKSIZE); } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1455,13 +1443,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - commit_inmem_pages(inode, true); - } - - if (f2fs_is_volatile_file(inode)) - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + commit_inmem_pages(inode, true); mnt_drop_write_file(filp); return ret; @@ -1496,6 +1480,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi); break; + case F2FS_GOING_DOWN_METAFLUSH: + sync_meta_pages(sbi, META, LONG_MAX); + f2fs_stop_checkpoint(sbi); + break; default: return -EINVAL; } @@ -1616,27 +1604,44 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - __u32 i, count; + __u32 sync; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(count, (__u32 __user *)arg)) + if (get_user(sync, (__u32 __user *)arg)) return -EFAULT; - if (!count || count > F2FS_BATCH_GC_MAX_NUM) - return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; - for (i = 0; i < count; i++) { + if (!sync) { if (!mutex_trylock(&sbi->gc_mutex)) - break; - - if (f2fs_gc(sbi)) - break; + return -EBUSY; + } else { + mutex_lock(&sbi->gc_mutex); } - if (put_user(i, (__u32 __user *)arg)) - return -EFAULT; + return f2fs_gc(sbi, sync); +} + +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct cp_control cpc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + cpc.reason = __get_cp_reason(sbi); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); return 0; } @@ -1672,6 +1677,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 782b8e72c094..fedbf67a0842 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,9 +78,12 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); @@ -257,6 +260,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; unsigned int secno, max_cost; + unsigned int last_segment = MAIN_SEGS(sbi); int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -267,6 +271,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); + if (p.max_search == 0) + goto out; + if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -277,9 +284,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); - if (segno >= MAIN_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + if (segno >= last_segment) { if (sbi->last_victim[p.gc_mode]) { + last_segment = sbi->last_victim[p.gc_mode]; sbi->last_victim[p.gc_mode] = 0; p.offset = 0; continue; @@ -327,6 +335,7 @@ got_it: sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); } +out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; @@ -541,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) int err; /* do not read out */ - page = grab_cache_page(inode->i_mapping, bidx); + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) return; @@ -550,8 +559,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (err) goto out; - if (unlikely(dn.data_blkaddr == NULL_ADDR)) + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + ClearPageUptodate(page); goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); @@ -580,7 +597,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) goto put_page_out; set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, META); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); @@ -611,7 +628,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) { struct page *page; - page = get_lock_data_page(inode, bidx); + page = get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -705,7 +722,7 @@ next_step: start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA); + start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { iput(inode); continue; @@ -797,13 +814,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return nfree; } -int f2fs_gc(struct f2fs_sb_info *sbi) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno = NULL_SEGNO; - unsigned int i; - int gc_type = BG_GC; - int nfree = 0; - int ret = -1; + unsigned int segno, i; + int gc_type = sync ? FG_GC : BG_GC; + int sec_freed = 0; + int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), @@ -812,12 +828,14 @@ int f2fs_gc(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); gc_more: + segno = NULL_SEGNO; + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) write_checkpoint(sbi, &cpc); @@ -830,23 +848,38 @@ gc_more: /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA); + META_SSA, true); - for (i = 0; i < sbi->segs_per_sec; i++) - nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * for FG_GC case, halt gcing left segments once failed one + * of segments in selected section to avoid long latency. + */ + if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && + gc_type == FG_GC) + break; + } + + if (i == sbi->segs_per_sec && gc_type == FG_GC) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, nfree)) - goto gc_more; + if (!sync) { + if (has_not_enough_free_secs(sbi, sec_freed)) + goto gc_more; - if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); + } stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); + + if (sync) + ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c5a055b3376e..b4a65be9f7d3 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,12 +19,6 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ -/* - * with this macro, we can control the max time we do garbage collection, - * when user triggers batch mode gc by ioctl. - */ -#define F2FS_BATCH_GC_MAX_NUM 16 - /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3d143be42895..bda7126466c0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" bool f2fs_may_inline_data(struct inode *inode) { @@ -274,12 +275,14 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(ipage, 0); + if (!truncate_inline_inode(ipage, 0)) + return false; f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - truncate_blocks(inode, 0, false); + if (truncate_blocks(inode, 0, false)) + return false; goto process_inline; } return false; @@ -568,3 +571,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, f2fs_put_page(ipage, 1); return 0; } + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct page *ipage; + int err = 0; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if (!f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); +out: + f2fs_put_page(ipage, 1); + return err; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 35aae65b3e5d..97e20decacb4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -296,16 +296,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; /* - * We need to lock here to prevent from producing dirty node pages + * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - f2fs_lock_op(sbi); update_inode_page(inode); - f2fs_unlock_op(sbi); - - if (wbc) - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi); return 0; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a680bf38e4f0..e48b80c49090 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -410,11 +410,14 @@ err_out: * If the symlink path is stored into inline_data, there is no * performance regression. */ - if (!err) + if (!err) { filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); - if (IS_DIRSYNC(dir)) - f2fs_sync_fs(sbi->sb, 1); + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } kfree(sd); f2fs_fname_crypto_free_buffer(&disk_link); @@ -947,8 +950,13 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; - cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); + cstr.name = kmalloc(cstr.len, GFP_NOFS); + if (!cstr.name) { + res = -ENOMEM; + goto errout; + } + memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ if (cstr.name[0] == 0 && cstr.len == 0) { @@ -970,6 +978,8 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook if (res < 0) goto errout; + kfree(cstr.name); + paddr = pstr.name; /* Null-terminate the name */ @@ -979,6 +989,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook page_cache_release(cpage); return *cookie = paddr; errout: + kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); kunmap(cpage); page_cache_release(cpage); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27d1a74dd6f3..7bcbc6e9c40d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1323,23 +1323,24 @@ static int f2fs_write_node_page(struct page *page, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); unlock_page(page); return 0; } - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - set_page_writeback(page); fio.blk_addr = ni.blk_addr; write_node_page(nid, &fio); @@ -1528,7 +1529,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1558,6 +1560,9 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + nm_i->ra_nid_pages, META_NAT, false); } /* @@ -1803,10 +1808,10 @@ int restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, bio_blocks); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR); + ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_meta_page(sbi, idx); + struct page *page = get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; @@ -2000,6 +2005,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7427e956ad81..e4fffd2d98c4 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -14,9 +14,11 @@ /* node block offset on the NAT area dedicated to the given start node id */ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) -/* # of pages to perform readahead before building free nids */ +/* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 4 +#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ + /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index faec2ca004b9..cbf74f47cce8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,7 +180,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR); + ra_meta_pages(sbi, blkaddr, 1, META_POR, true); while (1) { struct fsync_inode_entry *entry; @@ -188,7 +188,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -383,15 +383,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); + if (err) goto out; - } f2fs_wait_on_page_writeback(dn.node_page, NODE); @@ -456,7 +452,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); err: f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); out: f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, recovered = %d blocks, err = %d", @@ -485,7 +480,7 @@ static int recover_data(struct f2fs_sb_info *sbi, ra_meta_pages_cond(sbi, blkaddr); - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) { f2fs_put_page(page, 1); @@ -570,7 +565,7 @@ out: /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78e6d0696847..f77b3258454a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,8 +14,8 @@ #include <linux/blkdev.h> #include <linux/prefetch.h> #include <linux/kthread.h> -#include <linux/vmalloc.h> #include <linux/swap.h> +#include <linux/timer.h> #include "f2fs.h" #include "segment.h" @@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. @@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word) int num = 0; #if BITS_PER_LONG == 64 - if ((word & 0xffffffff) == 0) { + if ((word & 0xffffffff00000000UL) == 0) num += 32; + else word >>= 32; - } #endif - if ((word & 0xffff) == 0) { + if ((word & 0xffff0000) == 0) num += 16; + else word >>= 16; - } - if ((word & 0xff) == 0) { + + if ((word & 0xff00) == 0) num += 8; + else word >>= 8; - } + if ((word & 0xf0) == 0) num += 4; else word >>= 4; + if ((word & 0xc) == 0) num += 2; else word >>= 2; + if ((word & 0x2) == 0) num += 1; return num; @@ -68,26 +87,16 @@ static inline unsigned long __reverse_ffs(unsigned long word) * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * Example: - * LSB <--> MSB - * f2fs_set_bit(0, bitmap) => 0000 0001 - * f2fs_set_bit(7, bitmap) => 1000 0000 + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (!f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; @@ -97,14 +106,9 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, if (!offset) goto aligned; - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~0UL << quot; - submask = (unsigned char)(0xff << rest) >> rest; - submask <<= quot; - mask &= submask; - tmp &= mask; + tmp = __reverse_ulong((unsigned char *)p); + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) goto found_first; if (tmp) @@ -112,42 +116,34 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, size -= BITS_PER_LONG; result += BITS_PER_LONG; + p++; aligned: while (size & ~(BITS_PER_LONG-1)) { - tmp = *(p++); + tmp = __reverse_ulong((unsigned char *)p); if (tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; + p++; } if (!size) return result; - tmp = *p; + + tmp = __reverse_ulong((unsigned char *)p); found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (!tmp) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __reverse_ffs(tmp); -#endif } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; @@ -157,40 +153,36 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, if (!offset) goto aligned; - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~(~0UL << quot); - submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); - submask <<= quot; - mask += submask; - tmp |= mask; + tmp = __reverse_ulong((unsigned char *)p); + tmp |= ~((~0UL << offset) >> offset); + if (size < BITS_PER_LONG) goto found_first; - if (~tmp) + if (tmp != ~0UL) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; + p++; aligned: while (size & ~(BITS_PER_LONG - 1)) { - tmp = *(p++); - if (~tmp) + tmp = __reverse_ulong((unsigned char *)p); + if (tmp != ~0UL) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; + p++; } if (!size) return result; - tmp = *p; + tmp = __reverse_ulong((unsigned char *)p); found_first: - tmp |= ~0UL << size; - if (tmp == ~0UL) /* Are any bits zero? */ + tmp |= ~(~0UL << (BITS_PER_LONG - size)); + if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found_middle: return result + __reverse_ffz(tmp); -#endif } void register_inmem_page(struct inode *inode, struct page *page) @@ -257,11 +249,12 @@ int commit_inmem_pages(struct inode *inode, bool abort) trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; err = do_write_data_page(&fio); - submit_bio = true; if (err) { unlock_page(cur->page); break; } + clear_cold_data(cur->page); + submit_bio = true; } } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); @@ -296,7 +289,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi); + f2fs_gc(sbi, false); } } @@ -316,7 +309,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || - !available_free_memory(sbi, INO_ENTRIES)) + !available_free_memory(sbi, INO_ENTRIES) || + jiffies > sbi->cp_expires) f2fs_sync_fs(sbi->sb, true); } @@ -767,6 +761,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) mutex_unlock(&sit_i->sentry_lock); } +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return true; + + mutex_lock(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + mutex_unlock(&sit_i->sentry_lock); + + return is_cp; +} + /* * This function should be resided under the curseg_mutex lock */ @@ -1292,6 +1310,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .encrypted_page = NULL, }; + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.rw &= ~REQ_META; + set_page_writeback(page); f2fs_submit_page_mbio(&fio); } @@ -1369,7 +1390,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + if (!recover_curseg) + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { @@ -1449,6 +1477,23 @@ void f2fs_wait_on_page_writeback(struct page *page, } } +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + struct page *cpage; + + if (blkaddr == NEW_ADDR) + return; + + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_put_page(cpage, 1); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1586,7 +1631,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (npages >= 2) ra_meta_pages(sbi, start_sum_block(sbi), npages, - META_CP); + META_CP, true); /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) @@ -1596,7 +1641,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (__exist_node_summaries(sbi)) ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP); + NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -1955,12 +2000,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -1982,8 +2028,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * - sizeof(struct sec_entry)); + sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -2028,12 +2074,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2082,7 +2128,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(sbi); do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2174,7 +2220,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2196,7 +2242,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2301,7 +2347,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); - kfree(dirty_i->dirty_segmap[dirty_type]); + kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } @@ -2309,7 +2355,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - kfree(dirty_i->victim_secmap); + kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -2348,8 +2394,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) if (!free_i) return; SM_I(sbi)->free_info = NULL; - kfree(free_i->free_segmap); - kfree(free_i->free_secmap); + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); kfree(free_i); } @@ -2370,9 +2416,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) } kfree(sit_i->tmp_map); - vfree(sit_i->sentries); - vfree(sit_i->sec_entries); - kfree(sit_i->dirty_sentries_bitmap); + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b6e4ed15c698..ee44d346ea44 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -137,10 +137,12 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. + * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, - FG_GC + FG_GC, + FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f79478115d37..3a65e0132352 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -213,8 +213,10 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -231,6 +233,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(cp_interval), NULL, }; @@ -292,11 +296,16 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) { set_opt(sbi, BG_GC); - else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { clear_opt(sbi, BG_GC); - else { + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + set_opt(sbi, BG_GC); + set_opt(sbi, FORCE_FG_GC); + } else { kfree(name); return -EINVAL; } @@ -631,10 +640,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) - seq_printf(seq, ",background_gc=%s", "on"); - else + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { + if (test_opt(sbi, FORCE_FG_GC)) + seq_printf(seq, ",background_gc=%s", "sync"); + else + seq_printf(seq, ",background_gc=%s", "on"); + } else { seq_printf(seq, ",background_gc=%s", "off"); + } if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) @@ -742,6 +755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; + bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); sync_filesystem(sb); @@ -767,6 +781,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + /* disallow enable/disable extent_cache dynamically */ + if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "switch extent_cache option is not allowed"); + goto restore_opts; + } + /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount @@ -996,6 +1018,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->cp_interval = DEF_CP_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1332,6 +1355,8 @@ try_onemore: f2fs_commit_super(sbi, true); } + sbi->cp_expires = round_jiffies_up(jiffies); + return 0; free_kobj: diff --git a/fs/file.c b/fs/file.c index 6c672ad329e9..39f8f15921da 100644 --- a/fs/file.c +++ b/fs/file.c @@ -56,9 +56,35 @@ static void free_fdtable_rcu(struct rcu_head *rcu) __free_fdtable(container_of(rcu, struct fdtable, rcu)); } +#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) +#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) + +/* + * Copy 'count' fd bits from the old table to the new table and clear the extra + * space if any. This does not copy the file pointers. Called with the files + * spinlock held for write. + */ +static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int count) +{ + unsigned int cpy, set; + + cpy = count / BITS_PER_BYTE; + set = (nfdt->max_fds - count) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)nfdt->open_fds + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)nfdt->close_on_exec + cpy, 0, set); + + cpy = BITBIT_SIZE(count); + set = BITBIT_SIZE(nfdt->max_fds) - cpy; + memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); + memset((char *)nfdt->full_fds_bits + cpy, 0, set); +} + /* - * Expand the fdset in the files_struct. Called with the files spinlock - * held for write. + * Copy all file descriptors from the old table to the new, expanded table and + * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { @@ -69,14 +95,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); - memset((char *)(nfdt->fd) + cpy, 0, set); + memset((char *)nfdt->fd + cpy, 0, set); - cpy = ofdt->max_fds / BITS_PER_BYTE; - set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)(nfdt->open_fds) + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)(nfdt->close_on_exec) + cpy, 0, set); + copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } static struct fdtable * alloc_fdtable(unsigned int nr) @@ -115,12 +136,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->fd = data; data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; + data += nr / BITS_PER_BYTE; + fdt->full_fds_bits = data; return fdt; @@ -226,17 +249,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt) static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { - __clear_bit(fd, fdt->close_on_exec); + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); } -static inline void __set_open_fd(int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); + fd /= BITS_PER_LONG; + if (!~fdt->open_fds[fd]) + __set_bit(fd, fdt->full_fds_bits); } -static inline void __clear_open_fd(int fd, struct fdtable *fdt) +static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); + __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static int count_open_files(struct fdtable *fdt) @@ -262,7 +290,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; - int open_files, size, i; + int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; @@ -280,6 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; + new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); @@ -318,12 +347,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) open_files = count_open_files(old_fdt); } + copy_fd_bitmaps(new_fdt, old_fdt, open_files); + old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); - memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); - for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { @@ -341,19 +369,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) } spin_unlock(&oldf->file_lock); - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds - open_files) / 8; - int start = open_files / BITS_PER_LONG; - - memset(&new_fdt->open_fds[start], 0, left); - memset(&new_fdt->close_on_exec[start], 0, left); - } + /* clear the remainder */ + memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); @@ -454,10 +471,25 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, + .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), }; +static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start) +{ + unsigned long maxfd = fdt->max_fds; + unsigned long maxbit = maxfd / BITS_PER_LONG; + unsigned long bitbit = start / BITS_PER_LONG; + + bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; + if (bitbit > maxfd) + return maxfd; + if (bitbit > start) + start = bitbit; + return find_next_zero_bit(fdt->open_fds, maxfd, start); +} + /* * allocate a file descriptor, mark it busy. */ @@ -476,7 +508,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); + fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 091a36444972..206a68b1db1a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { - int next_memcg_id = 0; - struct bdi_writeback *wb; - struct wb_iter iter; + struct bdi_writeback *last_wb = NULL; + struct bdi_writeback *wb = list_entry(&bdi->wb_list, + struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); - bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { + list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; + if (last_wb) { + wb_put(last_wb); + last_wb = NULL; + } + /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || @@ -819,12 +824,22 @@ restart: wb_queue_work(wb, work); - next_memcg_id = wb->memcg_css->id + 1; + /* + * Pin @wb so that it stays on @bdi->wb_list. This allows + * continuing iteration from @wb after dropping and + * regrabbing rcu read lock. + */ + wb_get(wb); + last_wb = wb; + rcu_read_unlock(); wb_wait_for_completion(bdi, &fallback_work_done); goto restart; } rcu_read_unlock(); + + if (last_wb) + wb_put(last_wb); } #else /* CONFIG_CGROUP_WRITEBACK */ @@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; - struct wb_iter iter; if (!bdi_has_dirty_io(bdi)) continue; - bdi_for_each_wb(wb, bdi, &iter, 0) + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), false, reason); } @@ -1894,11 +1908,10 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; - struct wb_iter iter; - bdi_for_each_wb(wb, bdi, &iter, 0) - if (!list_empty(&bdi->wb.b_dirty_time)) - wb_wakeup(&bdi->wb); + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + if (!list_empty(&wb->b_dirty_time)) + wb_wakeup(wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -2136,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); old_inode = inode; - filemap_fdatawait(mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(mapping); cond_resched(); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 51dde817e1f2..6b028b7c4250 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = { static void fscache_objlist_config(struct fscache_objlist_data *data) { #ifdef CONFIG_KEYS - struct user_key_payload *confkey; + const struct user_key_payload *confkey; unsigned long config; struct key *key; const char *buf; @@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) config = 0; rcu_read_lock(); - confkey = key->payload.data; + confkey = user_key_payload(key); buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f523f2f04c19..e0faf8f2c868 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) int err; if (fc->no_flock) { - err = flock_lock_file_wait(file, fl); + err = locks_lock_file_wait(file, fl); } else { struct fuse_file *ff = file->private_data; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index cf4ab89159f4..9287a2d17b8c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1000,7 +1000,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) } if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { if (fl->fl_type == F_UNLCK) - posix_lock_file_wait(file, fl); + locks_lock_file_wait(file, fl); return -EIO; } if (IS_GETLK(cmd)) @@ -1031,7 +1031,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (gl) { if (fl_gh->gh_state == state) goto out; - flock_lock_file_wait(file, + locks_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); @@ -1056,7 +1056,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (error == GLR_TRYFAILED) error = -EAGAIN; } else { - error = flock_lock_file_wait(file, fl); + error = locks_lock_file_wait(file, fl); gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); } @@ -1071,7 +1071,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) struct gfs2_holder *fl_gh = &fp->f_fl_gh; mutex_lock(&fp->f_fl_mutex); - flock_lock_file_wait(file, fl); + locks_lock_file_wait(file, fl); if (fl_gh->gh_gl) { gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 81180022923f..d211b8e18566 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode uint32_t alloclen; int ret; - if (!new_valid_dev(rdev)) - return -EINVAL; - ri = jffs2_alloc_raw_inode(); if (!ri) return -ENOMEM; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index b8fd651307a4..ce1189793288 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void) void jffs2_destroy_slab_caches(void) { - if(full_dnode_slab) - kmem_cache_destroy(full_dnode_slab); - if(raw_dirent_slab) - kmem_cache_destroy(raw_dirent_slab); - if(raw_inode_slab) - kmem_cache_destroy(raw_inode_slab); - if(tmp_dnode_info_slab) - kmem_cache_destroy(tmp_dnode_info_slab); - if(raw_node_ref_slab) - kmem_cache_destroy(raw_node_ref_slab); - if(node_frag_slab) - kmem_cache_destroy(node_frag_slab); - if(inode_cache_slab) - kmem_cache_destroy(inode_cache_slab); + kmem_cache_destroy(full_dnode_slab); + kmem_cache_destroy(raw_dirent_slab); + kmem_cache_destroy(raw_inode_slab); + kmem_cache_destroy(tmp_dnode_info_slab); + kmem_cache_destroy(raw_node_ref_slab); + kmem_cache_destroy(node_frag_slab); + kmem_cache_destroy(inode_cache_slab); #ifdef CONFIG_JFFS2_FS_XATTR - if (xattr_datum_cache) - kmem_cache_destroy(xattr_datum_cache); - if (xattr_ref_cache) - kmem_cache_destroy(xattr_ref_cache); + kmem_cache_destroy(xattr_datum_cache); + kmem_cache_destroy(xattr_ref_cache); #endif } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 28e0aab42bc3..bfebbf13698c 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r err = jffs2_flash_read(c, (ref_offset(ref)) + read, rd->nsize - already, &read, &fd->name[already]); - if (unlikely(read != rd->nsize - already) && likely(!err)) + if (unlikely(read != rd->nsize - already) && likely(!err)) { + jffs2_free_full_dirent(fd); + JFFS2_ERROR("short read: wanted %d bytes, got %zd\n", + rd->nsize - already, read); return -EIO; + } if (unlikely(err)) { JFFS2_ERROR("read remainder of name: error %d\n", err); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 09ed55190ee2..63f31c0733c5 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); if (!c->wbuf_verify) { - kfree(c->oobbuf); kfree(c->wbuf); return -ENOMEM; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index acd394716349..112952037933 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho static int do_vfs_lock(struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_file_wait(fl->fl_file, fl); - break; - case FL_FLOCK: - res = flock_lock_file_wait(fl->fl_file, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_file_wait(fl->fl_file, fl); } /* diff --git a/fs/locks.c b/fs/locks.c index 2a54c800a223..0d2b3267e2a3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) { - struct file_lock_context *new; + struct file_lock_context *ctx; - if (likely(inode->i_flctx) || type == F_UNLCK) + /* paired with cmpxchg() below */ + ctx = smp_load_acquire(&inode->i_flctx); + if (likely(ctx) || type == F_UNLCK) goto out; - new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); - if (!new) + ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!ctx) goto out; - spin_lock_init(&new->flc_lock); - INIT_LIST_HEAD(&new->flc_flock); - INIT_LIST_HEAD(&new->flc_posix); - INIT_LIST_HEAD(&new->flc_lease); + spin_lock_init(&ctx->flc_lock); + INIT_LIST_HEAD(&ctx->flc_flock); + INIT_LIST_HEAD(&ctx->flc_posix); + INIT_LIST_HEAD(&ctx->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, new)) - kmem_cache_free(flctx_cache, new); + if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + kmem_cache_free(flctx_cache, ctx); + ctx = smp_load_acquire(&inode->i_flctx); + } out: - return inode->i_flctx; + return ctx; } void @@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock_context *ctx; struct inode *inode = file_inode(filp); - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; return; @@ -1167,10 +1171,9 @@ EXPORT_SYMBOL(posix_lock_file); * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Variant of posix_lock_file_wait that does not take a filp, and so can be - * used after the filp has already been torn down. + * Apply a POSIX style lock request to an inode. */ -int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) +static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); @@ -1187,7 +1190,6 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_inode_wait); /** * locks_mandatory_locked - Check for an active lock @@ -1203,7 +1205,7 @@ int locks_mandatory_locked(struct file *file) struct file_lock_context *ctx; struct file_lock *fl; - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) return 0; @@ -1388,7 +1390,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; struct file_lock *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; @@ -1400,6 +1402,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) new_fl->fl_flags = type; /* typically we will check that ctx is non-NULL before calling */ + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { WARN_ON_ONCE(1); return error; @@ -1494,9 +1497,10 @@ EXPORT_SYMBOL(__break_lease); void lease_get_mtime(struct inode *inode, struct timespec *time) { bool has_lease = false; - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; struct file_lock *fl; + ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); if (!list_empty(&ctx->flc_lease)) { @@ -1543,10 +1547,11 @@ int fcntl_getlease(struct file *filp) { struct file_lock *fl; struct inode *inode = file_inode(filp); - struct file_lock_context *ctx = inode->i_flctx; + struct file_lock_context *ctx; int type = F_UNLCK; LIST_HEAD(dispose); + ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); time_out_leases(file_inode(filp), &dispose); @@ -1711,11 +1716,11 @@ static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; struct file_lock *fl, *victim = NULL; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct file_lock_context *ctx = inode->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx; LIST_HEAD(dispose); + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) { trace_generic_delete_lease(inode, NULL); return error; @@ -1751,8 +1756,7 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); int error; if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE)) @@ -1856,7 +1860,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) * * Apply a FLOCK style lock request to an inode. */ -int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) +static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); @@ -1873,7 +1877,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(flock_lock_inode_wait); + +/** + * locks_lock_inode_wait - Apply a lock to an inode + * @inode: inode of the file to apply to + * @fl: The lock to be applied + * + * Apply a POSIX or FLOCK style lock request to an inode. + */ +int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) +{ + int res = 0; + switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { + case FL_POSIX: + res = posix_lock_inode_wait(inode, fl); + break; + case FL_FLOCK: + res = flock_lock_inode_wait(inode, fl); + break; + default: + BUG(); + } + return res; +} +EXPORT_SYMBOL(locks_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -1931,7 +1958,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) (can_sleep) ? F_SETLKW : F_SETLK, lock); else - error = flock_lock_file_wait(f.file, lock); + error = locks_lock_file_wait(f.file, lock); out_free: locks_free_lock(lock); @@ -2107,7 +2134,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } -/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */ static int check_fmode_for_setlk(struct file_lock *fl) { @@ -2359,13 +2386,14 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { struct file_lock lock; - struct file_lock_context *ctx = file_inode(filp)->i_flctx; + struct file_lock_context *ctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ + ctx = smp_load_acquire(&file_inode(filp)->i_flctx); if (!ctx || list_empty(&ctx->flc_posix)) return; @@ -2389,7 +2417,7 @@ EXPORT_SYMBOL(locks_remove_posix); /* The i_flctx must be valid when calling into here */ static void -locks_remove_flock(struct file *filp) +locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { struct file_lock fl = { .fl_owner = filp, @@ -2400,7 +2428,6 @@ locks_remove_flock(struct file *filp) .fl_end = OFFSET_MAX, }; struct inode *inode = file_inode(filp); - struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2416,10 +2443,8 @@ locks_remove_flock(struct file *filp) /* The i_flctx must be valid when calling into here */ static void -locks_remove_lease(struct file *filp) +locks_remove_lease(struct file *filp, struct file_lock_context *ctx) { - struct inode *inode = file_inode(filp); - struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl, *tmp; LIST_HEAD(dispose); @@ -2439,17 +2464,20 @@ locks_remove_lease(struct file *filp) */ void locks_remove_file(struct file *filp) { - if (!file_inode(filp)->i_flctx) + struct file_lock_context *ctx; + + ctx = smp_load_acquire(&file_inode(filp)->i_flctx); + if (!ctx) return; /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ - locks_remove_flock(filp); + locks_remove_flock(filp, ctx); /* remove any leases */ - locks_remove_lease(filp); + locks_remove_lease(filp, ctx); } /** @@ -2616,7 +2644,7 @@ void show_fd_locks(struct seq_file *f, struct file_lock_context *ctx; int id = 0; - ctx = inode->i_flctx; + ctx = smp_load_acquire(&inode->i_flctx); if (!ctx) return; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a7fdbd868474..a709d80c8ebc 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); diff --git a/fs/mpage.c b/fs/mpage.c index 778a4ddef77a..09abba7653aa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -139,7 +139,8 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, struct buffer_head *map_bh, - unsigned long *first_logical_block, get_block_t get_block) + unsigned long *first_logical_block, get_block_t get_block, + gfp_t gfp) { struct inode *inode = page->mapping->host; const unsigned blkbits = inode->i_blkbits; @@ -277,8 +278,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, nr_pages, BIO_MAX_PAGES), - GFP_KERNEL); + min_t(int, nr_pages, BIO_MAX_PAGES), gfp); if (bio == NULL) goto confused; } @@ -361,6 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; + gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping); map_bh.b_state = 0; map_bh.b_size = 0; @@ -370,12 +371,13 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, prefetchw(&page->flags); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + page->index, + gfp)) { bio = do_mpage_readpage(bio, page, nr_pages - page_idx, &last_block_in_bio, &map_bh, &first_logical_block, - get_block); + get_block, gfp); } page_cache_release(page); } @@ -395,11 +397,12 @@ int mpage_readpage(struct page *page, get_block_t get_block) sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; + gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping); map_bh.b_state = 0; map_bh.b_size = 0; bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, - &map_bh, &first_logical_block, get_block); + &map_bh, &first_logical_block, get_block, gfp); if (bio) mpage_bio_submit(READ, bio); return 0; @@ -482,6 +485,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -590,7 +594,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -617,7 +621,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -627,7 +631,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -639,7 +643,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -695,8 +699,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -713,8 +720,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namei.c b/fs/namei.c index 726d211db484..0d3340b32e14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -955,26 +955,23 @@ static bool safe_hardlink_source(struct inode *inode) * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) - * - not CAP_FOWNER + * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * Returns 0 if successful, -ve on error. */ static int may_linkat(struct path *link) { - const struct cred *cred; struct inode *inode; if (!sysctl_protected_hardlinks) return 0; - cred = current_cred(); inode = link->dentry->d_inode; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || - capable(CAP_FOWNER)) + if (inode_owner_or_capable(inode) || safe_hardlink_source(inode)) return 0; audit_log_link_denied("linkat", link); @@ -1558,8 +1555,6 @@ static int lookup_fast(struct nameidata *nd, negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; - if (negative) - return -ENOENT; /* * This sequence count validates that the parent had no @@ -1580,6 +1575,12 @@ static int lookup_fast(struct nameidata *nd, goto unlazy; } } + /* + * Note: do negative dentry check after revalidation in + * case that drops it. + */ + if (negative) + return -ENOENT; path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 2714ef835bdd..be806ead7f4d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -113,7 +113,8 @@ out: return status; } -static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid) +static int nfs_delegation_claim_opens(struct inode *inode, + const nfs4_stateid *stateid, fmode_t type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; @@ -140,7 +141,7 @@ again: /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - err = nfs4_open_delegation_recall(ctx, state, stateid); + err = nfs4_open_delegation_recall(ctx, state, stateid, type); if (!err) err = nfs_delegation_claim_locks(ctx, state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) @@ -411,7 +412,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation do { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; - err = nfs_delegation_claim_opens(inode, &delegation->stateid); + err = nfs_delegation_claim_opens(inode, &delegation->stateid, + delegation->type); if (!issync || err != -EAGAIN) break; /* diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index a44829173e57..333063e032f0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -54,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid); +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 38678d9a5cc4..4b1d08f56aba 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -166,8 +166,11 @@ nfs_direct_select_verf(struct nfs_direct_req *dreq, struct nfs_writeverf *verfp = &dreq->verf; #ifdef CONFIG_NFS_V4_1 - if (ds_clp) { - /* pNFS is in use, use the DS verf */ + /* + * pNFS is in use, use the DS verf except commit_through_mds is set + * for layout segment where nbuckets is zero. + */ + if (ds_clp && dreq->ds_cinfo.nbuckets > 0) { if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; else diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c0f9b1ed12b9..37f639d50af5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -738,18 +738,7 @@ out_noconflict: static int do_vfs_lock(struct file *file, struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_file_wait(file, fl); - break; - case FL_FLOCK: - res = flock_lock_file_wait(file, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_file_wait(file, fl); } static int diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index b34f2e228601..02ec07973bc4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -629,23 +629,18 @@ out_put: goto out; } -static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl) { int i; - for (i = 0; i < fl->num_fh; i++) { - if (!fl->fh_array[i]) - break; - kfree(fl->fh_array[i]); + if (fl->fh_array) { + for (i = 0; i < fl->num_fh; i++) { + if (!fl->fh_array[i]) + break; + kfree(fl->fh_array[i]); + } + kfree(fl->fh_array); } - kfree(fl->fh_array); - fl->fh_array = NULL; -} - -static void -_filelayout_free_lseg(struct nfs4_filelayout_segment *fl) -{ - filelayout_free_fh_array(fl); kfree(fl); } @@ -716,21 +711,21 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Do we want to use a mempool here? */ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags); if (!fl->fh_array[i]) - goto out_err_free; + goto out_err; p = xdr_inline_decode(&stream, 4); if (unlikely(!p)) - goto out_err_free; + goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); - goto out_err_free; + goto out_err; } p = xdr_inline_decode(&stream, fl->fh_array[i]->size); if (unlikely(!p)) - goto out_err_free; + goto out_err; memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); dprintk("DEBUG: %s: fh len %d\n", __func__, fl->fh_array[i]->size); @@ -739,8 +734,6 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __free_page(scratch); return 0; -out_err_free: - filelayout_free_fh_array(fl); out_err: __free_page(scratch); return -EIO; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index d731bbf974aa..0f020e4d8421 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -175,10 +175,12 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; - int err; + loff_t err; do { err = _nfs42_proc_llseek(filep, offset, whence); + if (err >= 0) + break; if (err == -ENOTSUPP) return -EOPNOTSUPP; err = nfs4_handle_exception(server, err, &exception); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 2e4902203c35..5ba22c6b0ffa 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, { const struct cred *saved_cred; struct key *rkey; - struct user_key_payload *payload; + const struct user_key_payload *payload; ssize_t ret; saved_cred = override_creds(id_resolver_cache); @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = rcu_dereference(rkey->payload.rcudata); + payload = user_key_payload(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 693b903b48bd..0e5ff69455c7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1127,6 +1127,21 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) return ret; } +static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, + fmode_t fmode) +{ + switch(fmode & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + return state->n_rdwr != 0; + case FMODE_WRITE: + return state->n_wronly != 0; + case FMODE_READ: + return state->n_rdonly != 0; + } + WARN_ON_ONCE(1); + return false; +} + static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) { int ret = 0; @@ -1443,12 +1458,18 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state) if (delegation) delegation_flags = delegation->flags; rcu_read_unlock(); - if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + switch (data->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: pr_err_ratelimited("NFS: Broken NFSv4 server %s is " "returning a delegation for " "OPEN(CLAIM_DELEGATE_CUR)\n", clp->cl_hostname); - } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + return; + } + if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, data->owner->so_cred, &data->o_res); @@ -1571,17 +1592,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context return opendata; } -static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res) +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, + fmode_t fmode) { struct nfs4_state *newstate; int ret; - if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || - opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && - (opendata->o_arg.u.delegation_type & fmode) != fmode) - /* This mode can't have been delegated, so we must have - * a valid open_stateid to cover it - not need to reclaim. - */ + if (!nfs4_mode_match_open_stateid(opendata->state, fmode)) return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; @@ -1597,14 +1614,14 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod newstate = nfs4_opendata_to_nfs4_state(opendata); if (IS_ERR(newstate)) return PTR_ERR(newstate); + if (newstate != opendata->state) + ret = -ESTALE; nfs4_close_state(newstate, fmode); - *res = newstate; - return 0; + return ret; } static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) { - struct nfs4_state *newstate; int ret; /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */ @@ -1615,27 +1632,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * clear_bit(NFS_DELEGATED_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); smp_rmb(); - if (state->n_rdwr != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } - if (state->n_wronly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } - if (state->n_rdonly != 0) { - ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); - if (ret != 0) - return ret; - if (newstate != state) - return -ESTALE; - } + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); + if (ret != 0) + return ret; + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE); + if (ret != 0) + return ret; + ret = nfs4_open_recover_helper(opendata, FMODE_READ); + if (ret != 0) + return ret; /* * We may have performed cached opens for all three recoveries. * Check if we need to update the current stateid. @@ -1759,18 +1764,35 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return err; } -int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +int nfs4_open_delegation_recall(struct nfs_open_context *ctx, + struct nfs4_state *state, const nfs4_stateid *stateid, + fmode_t type) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_opendata *opendata; - int err; + int err = 0; opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_DELEG_CUR_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid); - err = nfs4_open_recover(opendata, state); + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); + clear_bit(NFS_DELEGATED_STATE, &state->flags); + switch (type & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ|FMODE_WRITE: + case FMODE_WRITE: + err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE); + if (err) + break; + err = nfs4_open_recover_helper(opendata, FMODE_WRITE); + if (err) + break; + case FMODE_READ: + err = nfs4_open_recover_helper(opendata, FMODE_READ); + } nfs4_opendata_put(opendata); return nfs4_handle_delegation_recall_error(server, state, stateid, err); } @@ -1850,6 +1872,8 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) data->rpc_done = 0; data->rpc_status = 0; data->timestamp = jiffies; + if (data->is_recover) + nfs4_set_sequence_privileged(&data->c_arg.seq_args); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -2645,6 +2669,15 @@ out: return err; } +static bool +nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) +{ + if (inode == NULL || !nfs_have_layout(inode)) + return false; + + return pnfs_wait_on_layoutreturn(inode, task); +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -2763,6 +2796,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } + if (nfs4_wait_on_layoutreturn(inode, task)) { + nfs_release_seqid(calldata->arg.seqid); + goto out_wait; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc) @@ -5308,6 +5346,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; + if (nfs4_wait_on_layoutreturn(d_data->inode, task)) + return; + if (d_data->roc) pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); @@ -5472,18 +5513,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { - int res = 0; - switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { - case FL_POSIX: - res = posix_lock_inode_wait(inode, fl); - break; - case FL_FLOCK: - res = flock_lock_inode_wait(inode, fl); - break; - default: - BUG(); - } - return res; + return locks_lock_inode_wait(inode, fl); } struct nfs4_unlockdata { @@ -7800,39 +7830,46 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", __func__, delay); rpc_delay(task, delay); - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out; /* Do not call nfs4_async_handle_error() */ + /* Do not call nfs4_async_handle_error() */ + goto out_restart; } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; - if (!lo || list_empty(&lo->plh_segs)) { + if (nfs4_stateid_match(&lgp->args.stateid, + &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ state = lgp->args.ctx->state; - } else { + break; + } + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&lgp->args.stateid, + &lo->plh_stateid)) { LIST_HEAD(head); /* * Mark the bad layout state as invalid, then retry * with the current stateid. */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); - - task->tk_status = 0; - rpc_restart_call_prepare(task); - } + } else + spin_unlock(&inode->i_lock); + goto out_restart; } if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) - rpc_restart_call_prepare(task); + goto out_restart; out: dprintk("<-- %s\n", __func__); return; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; out_overflow: task->tk_status = -EOVERFLOW; goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index da73bc443238..d854693a15b0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1481,7 +1481,7 @@ restart: spin_unlock(&state->state_lock); } nfs4_put_open_state(state); - clear_bit(NFS4CLNT_RECLAIM_NOGRACE, + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); spin_lock(&sp->so_lock); goto restart; @@ -1725,7 +1725,8 @@ restart: if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags)) continue; - atomic_inc(&sp->so_count); + if (!atomic_inc_not_zero(&sp->so_count)) + continue; spin_unlock(&clp->cl_lock); rcu_read_unlock(); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 28df12e525ba..671cf68fe56b 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; - if (!IS_ERR(state)) + if (!IS_ERR_OR_NULL(state)) inode = state->inode; if (inode != NULL) { __entry->fileid = NFS_FILEID(inode); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7c5718ba625e..fe3ddd20ff89 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -508,7 +508,7 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, * for it without upsetting the slab allocator. */ if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) * - sizeof(struct page) > PAGE_SIZE) + sizeof(struct page *) > PAGE_SIZE) return 0; return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ba1246433794..8abe27165ad0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1104,20 +1104,15 @@ bool pnfs_roc(struct inode *ino) mark_lseg_invalid(lseg, &tmp_list); found = true; } - /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put - * in pnfs_roc_release(). We don't really send a layoutreturn but - * still want others to view us like we are sending one! - * - * If pnfs_prepare_layoutreturn() fails, it means someone else is doing - * LAYOUTRETURN, so we proceed like there are no layouts to return. - * - * ROC in three conditions: + /* ROC in two conditions: * 1. there are ROC lsegs * 2. we don't send layoutreturn - * 3. no others are sending layoutreturn */ - if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo)) + if (found && !layoutreturn) { + /* lo ref dropped in pnfs_roc_release() */ + pnfs_get_layout_hdr(lo); roc = true; + } out_noroc: spin_unlock(&ino->i_lock); @@ -1172,6 +1167,26 @@ void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) spin_unlock(&ino->i_lock); } +bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + bool sleep = false; + + /* we might not have grabbed lo reference. so need to check under + * i_lock */ + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + sleep = true; + spin_unlock(&ino->i_lock); + + if (sleep) + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + + return sleep; +} + /* * Compare two layout segments for sorting into layout cache. * We want to preferentially return RW over RO layouts, so ensure those diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 78c9351ff117..d1990e90e7a0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -270,6 +270,7 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); +bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); @@ -639,6 +640,12 @@ pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) { } +static inline bool +pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) +{ + return false; +} + static inline void set_pnfs_layoutdriver(struct nfs_server *s, const struct nfs_fh *mntfh, u32 id) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ae0ff7a11b40..01b8cc8e8cfc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -72,6 +72,9 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; + if (pgio->pg_ops && pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + pgio->pg_ops = &nfs_pgio_rw_ops; /* read path should never have more than one mirror */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 388f48079c43..75ab7622e0cc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -569,19 +569,17 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); ret = pgio->pg_error; - } + } else + nfs_add_stats(page_file_mapping(page)->host, + NFSIOS_WRITEPAGES, 1); out: return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - struct inode *inode = page_file_mapping(page)->host; int ret; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { @@ -597,9 +595,11 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; + struct inode *inode = page_file_mapping(page)->host; int err; - nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc), + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); + nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); @@ -1223,7 +1223,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino return 1; if (!flctx || (list_empty_careful(&flctx->flc_flock) && list_empty_careful(&flctx->flc_posix))) - return 0; + return 1; /* Check to see if there are whole file write locks */ ret = 0; @@ -1351,6 +1351,9 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; + if (pgio->pg_ops && pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + pgio->pg_ops = &nfs_pgio_rw_ops; nfs_pageio_stop_mirroring(pgio); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index cdefaa331a07..c29d9421bd5e 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -56,14 +56,6 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, u32 device_generation = 0; int error; - /* - * We do not attempt to support I/O smaller than the fs block size, - * or not aligned to it. - */ - if (args->lg_minlength < block_size) { - dprintk("pnfsd: I/O too small\n"); - goto out_layoutunavailable; - } if (seg->offset & (block_size - 1)) { dprintk("pnfsd: I/O misaligned\n"); goto out_layoutunavailable; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6b6f0d472ae8..fd98e5100cab 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(mark->inode); if (inode) { + /* + * IN_ALL_EVENTS represents all of the mask bits + * that we expose to userspace. There is at + * least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ + u32 mask = mark->mask & IN_ALL_EVENTS; seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); + mask, mark->ignored_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5b1e2a497e51..b8d08d0d0a4d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; - /* don't allow invalid bits: we don't want flags set */ + /* + * We share a lot of code with fs/dnotify. We also share + * the bit layout between inotify's IN_* and the fsnotify + * FS_*. This check ensures that only the inotify IN_* + * bits get passed in and set in watches/events. + */ + if (unlikely(mask & ~ALL_INOTIFY_BITS)) + return -EINVAL; + /* + * Require at least one valid bit set in the mask. + * Without _something_ set, we would have no events to + * watch for. + */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b11d90eca6..7f604727f487 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = -EIO; goto bail; } + set_buffer_new(bh_result); up_write(&OCFS2_I(inode)->ip_alloc_sem); } @@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, is_overwrite = ocfs2_is_overwrite(osb, inode, offset); if (is_overwrite < 0) { mlog_errno(is_overwrite); + ret = is_overwrite; ocfs2_inode_unlock(inode, 1); goto clean_orphan; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index fa15debcc02b..ddddef0021a0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -219,7 +219,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data) set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(®->hr_steady_iterations) == 0); + atomic_read(®->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6918f30d02cd..2ee7fe747cea 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) int status; unsigned int backoff; unsigned int total_backoff = 0; + char wq_name[O2NM_MAX_NAME_LEN]; BUG_ON(!dlm); @@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); + dlm->dlm_worker = create_singlethread_workqueue(wq_name); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 46b8b2bbc95a..ce38b4ccc9ab 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1439,6 +1439,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, int found, ret; int set_maybe; int dispatch_assert = 0; + int dispatched = 0; if (!dlm_grab(dlm)) return DLM_MASTER_RESP_NO; @@ -1657,16 +1658,20 @@ send_response: if (ret < 0) { mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; + spin_unlock(&res->spinlock); dlm_lockres_put(res); - } else + } else { + dispatched = 1; __dlm_lockres_grab_inflight_worker(dlm, res); - spin_unlock(&res->spinlock); + spin_unlock(&res->spinlock); + } } else { if (res) dlm_lockres_put(res); } - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return response; } @@ -2090,7 +2095,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, /* queue up work for dlm_assert_master_worker */ - dlm_grab(dlm); /* get an extra ref for the work item */ dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); item->u.am.lockres = res; /* already have a ref */ /* can optionally ignore node numbers higher than this node */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..9e4f862d20fe 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) mlog(0, "starting dlm recovery thread...\n"); dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, - "dlm_reco_thread"); + "dlm_reco-%s", dlm->name); if (IS_ERR(dlm->dlm_reco_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); dlm->dlm_reco_thread_task = NULL; @@ -1694,6 +1694,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; + int dispatched = 0; if (!dlm_grab(dlm)) { /* since the domain has gone away on this @@ -1719,9 +1720,11 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, dlm_put(dlm); /* sender will take care of this and retry */ return ret; - } else + } else { + dispatched = 1; __dlm_lockres_grab_inflight_worker(dlm, res); - spin_unlock(&res->spinlock); + spin_unlock(&res->spinlock); + } } else { /* put.. incase we are not the master */ spin_unlock(&res->spinlock); @@ -1730,7 +1733,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, } spin_unlock(&dlm->spinlock); - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return master; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2e5e6d5fffe8..c5f6c241ecd7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) { mlog(0, "Starting dlm_thread...\n"); - dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", + dlm->name); if (IS_ERR(dlm->dlm_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_thread_task)); dlm->dlm_thread_task = NULL; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c91103c1333..20276e340339 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) } /* launch downconvert thread */ - osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", + osb->uuid_str); if (IS_ERR(osb->dc_task)) { status = PTR_ERR(osb->dc_task); osb->dc_task = NULL; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..aac8b86f312e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -112,6 +112,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 +/* Entry in orphan dir with 'dio-' prefix */ +#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff82b28462a6..13534f4fe5b5 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) /* Launch the commit thread */ if (!local) { osb->commit_task = kthread_run(ocfs2_commit_thread, osb, - "ocfs2cmt"); + "ocfs2cmt-%s", osb->uuid_str); if (IS_ERR(osb->commit_task)) { status = PTR_ERR(osb->commit_task); osb->commit_task = NULL; @@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) goto out; osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, - "ocfs2rec"); + "ocfs2rec-%s", osb->uuid_str); if (IS_ERR(osb->recovery_thread_task)) { mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); osb->recovery_thread_task = NULL; @@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv { struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; + enum ocfs2_orphan_reco_type orphan_reco_type; }; static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, @@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (name_len == 2 && !strncmp("..", name, 2)) return 0; + /* do not include dio entry in case of orphan scan */ + if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && + (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN))) + return 0; + /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); if (IS_ERR(iter)) return 0; + if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN)) + OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; + /* Skip inodes which are already added to recover list, since dio may * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { @@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, static int ocfs2_queue_orphans(struct ocfs2_super *osb, int slot, - struct inode **head) + struct inode **head, + enum ocfs2_orphan_reco_type orphan_reco_type) { int status; struct inode *orphan_dir_inode = NULL; struct ocfs2_orphan_filldir_priv priv = { .ctx.actor = ocfs2_orphan_filldir, .osb = osb, - .head = *head + .head = *head, + .orphan_reco_type = orphan_reco_type }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, @@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); - ret = ocfs2_queue_orphans(osb, slot, &inode); + ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); ocfs2_clear_recovering_orphan_dir(osb, slot); /* Error here should be noted, but we want to continue with as @@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; - mutex_lock(&inode->i_mutex); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto next; - } - /* - * We need to take and drop the inode lock to - * force read inode from disk. - */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); - goto unlock_rw; - } + if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { + mutex_lock(&inode->i_mutex); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto unlock_mutex; + } + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto unlock_rw; + } - di = (struct ocfs2_dinode *)di_bh->b_data; + di = (struct ocfs2_dinode *)di_bh->b_data; - if (inode->i_nlink == 0) { + if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto unlock_inode; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, + di_bh, 0, 0); + if (ret) + mlog_errno(ret); + } +unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; +unlock_rw: + ocfs2_rw_unlock(inode, 1); +unlock_mutex: + mutex_unlock(&inode->i_mutex); + + /* clear dio flag in ocfs2_inode_info */ + oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; + } else { spin_lock(&oi->ip_lock); /* Set the proper information to get us going into * ocfs2_delete_inode. */ @@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, spin_unlock(&oi->ip_lock); } - if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && - (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size_read(inode)); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto unlock_inode; - } - - ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); - if (ret) - mlog_errno(ret); - } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ -unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; -unlock_rw: - ocfs2_rw_unlock(inode, 1); -next: - mutex_unlock(&inode->i_mutex); iput(inode); inode = iter; } diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 6b6d092b0998..652ece4a9d9e 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -66,7 +66,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, * level. */ - flock_lock_file_wait(file, + locks_lock_file_wait(file, &(struct file_lock){.fl_type = F_UNLCK}); ocfs2_file_unlock(file); @@ -81,7 +81,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, goto out; } - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); if (ret) ocfs2_file_unlock(file); @@ -98,7 +98,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->fp_mutex); ocfs2_file_unlock(file); - ret = flock_lock_file_wait(file, fl); + ret = locks_lock_file_wait(file, fl); mutex_unlock(&fp->fp_mutex); return ret; @@ -119,7 +119,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) - return flock_lock_file_wait(file, fl); + return locks_lock_file_wait(file, fl); if (fl->fl_type == F_UNLCK) return ocfs2_do_funlock(file, cmd, fl); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b7dfac226b1e..3b48ac25d8a7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) -#define OCFS2_DIO_ORPHAN_PREFIX "dio-" -#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); + if (status < 0) { + u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); + int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, + inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); + if (tmp) + mlog_errno(tmp); + } + + return status; } static int ocfs2_mkdir(struct inode *dir, diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e173329eb830..1155918d6784 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -26,6 +26,9 @@ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e5d57cd32505..252119860e6c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to, readahead_pages; + unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); - readahead_pages = - (ocfs2_cow_contig_clusters(sb) << - OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d83d2602cf2b..fc6d25f6d444 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { - hint = ocfs2_group_from_res(res); + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) + hint = res->sr_bg_blkno; + else + hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 84d693d37428..871fcb67be97 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -81,11 +81,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) if (len == 0) return 0; - old_file = ovl_path_open(old, O_RDONLY); + old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY); if (IS_ERR(old_file)) return PTR_ERR(old_file); - new_file = ovl_path_open(new, O_WRONLY); + new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY); if (IS_ERR(new_file)) { error = PTR_ERR(new_file); goto out_fput; @@ -267,7 +267,7 @@ out: out_cleanup: ovl_cleanup(wdir, newdentry); - goto out; + goto out2; } /* diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d9da5a4e9382..ec0c2a050043 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -363,6 +363,9 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) ovl_path_upper(dentry, &realpath); } + if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE) + return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags); + return d_backing_inode(realpath.dentry); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 79073d68b475..e38ee0fed24a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -544,6 +544,7 @@ static void ovl_put_super(struct super_block *sb) mntput(ufs->upper_mnt); for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); + kfree(ufs->lower_mnt); kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); @@ -1048,6 +1049,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) oe->lowerstack[i].dentry = stack[i].dentry; oe->lowerstack[i].mnt = ufs->lower_mnt[i]; } + kfree(stack); root_dentry->d_fsdata = oe; diff --git a/fs/proc/array.c b/fs/proc/array.c index f60f0121e331..eed2050db9be 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -375,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { - unsigned long vsize, eip, esp, wchan = ~0UL; + unsigned long vsize, eip, esp, wchan = 0; int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; @@ -507,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); - seq_put_decimal_ull(m, ' ', wchan); + + /* + * We used to output the absolute kernel address, but that's an + * information leak - so instead we show a 0/1 flag here, to signal + * to user-space whether there's a wchan field in /proc/PID/wchan. + * + * This works with older implementations of procps as well. + */ + if (wchan) + seq_puts(m, " 1"); + else + seq_puts(m, " 0"); + seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ll(m, ' ', task->exit_signal); diff --git a/fs/proc/base.c b/fs/proc/base.c index b25eee4cead5..bd3e9e68125b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) { - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return 0; - seq_printf(m, "%lu", wchan); - } else { + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); - } + else + seq_putc(m, '0'); return 0; } @@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, return simple_read_from_buffer(buf, count, ppos, buffer, len); } +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d3ebf2e61853..9155a5a0d3b9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; - struct vmalloc_info vmi; long cached; long available; unsigned long pagecache; @@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) if (cached < 0) cached = 0; - get_vmalloc_info(&vmi); - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); @@ -191,8 +188,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, - vmi.used >> 10, - vmi.largest_chunk >> 10 + 0ul, // used to be vmalloc 'used' + 0ul // used to be vmalloc 'largest_chunk' #ifdef CONFIG_MEMORY_FAILURE , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2d46adb54b4..187b3b5f242e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) ptes >> 10, pmds >> 10, swap << (PAGE_SHIFT-10)); + hugetlb_report_usage(m, mm); } unsigned long task_vsize(struct mm_struct *mm) @@ -446,6 +447,8 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; u64 pss; u64 swap_pss; }; @@ -625,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) seq_putc(m, '\n'); } +#ifdef CONFIG_HUGETLB_PAGE +static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } + if (page) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) + mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } + return 0; +} +#endif /* HUGETLB_PAGE */ + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, +#ifdef CONFIG_HUGETLB_PAGE + .hugetlb_entry = smaps_hugetlb_range, +#endif .mm = vma->vm_mm, .private = &mss, }; @@ -652,6 +687,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" @@ -667,6 +704,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shared_hugetlb >> 10, + mss.private_hugetlb >> 10, mss.swap >> 10, (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, @@ -753,36 +792,37 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { + ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } - - set_pte_at(vma->vm_mm, addr, pte, ptent); } +#else +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} +#endif +#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); + pmd = pmd_clear_soft_dirty(pmd); if (vma->vm_flags & VM_SOFTDIRTY) vma->vm_flags &= ~VM_SOFTDIRTY; set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } - #else - -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} - static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 916b8e23d968..360ae43f590c 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,5 +1,5 @@ config PSTORE - bool "Persistent store support" + tristate "Persistent store support" default n select ZLIB_DEFLATE select ZLIB_INFLATE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index e647d8e81712..b8803cc07fce 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -2,12 +2,12 @@ # Makefile for the linux pstorefs routines. # -obj-y += pstore.o +obj-$(CONFIG_PSTORE) += pstore.o pstore-objs += inode.o platform.o -obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o +pstore-$(CONFIG_PSTORE_FTRACE) += ftrace.o -obj-$(CONFIG_PSTORE_PMSG) += pmsg.o +pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 76a4eeb92982..d4887705bb61 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = { .write = pstore_ftrace_knob_write, }; +static struct dentry *pstore_ftrace_dir; + void pstore_register_ftrace(void) { - struct dentry *dir; struct dentry *file; if (!psinfo->write_buf) return; - dir = debugfs_create_dir("pstore", NULL); - if (!dir) { + pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); + if (!pstore_ftrace_dir) { pr_err("%s: unable to create pstore directory\n", __func__); return; } - file = debugfs_create_file("record_ftrace", 0600, dir, NULL, - &pstore_knob_fops); + file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir, + NULL, &pstore_knob_fops); if (!file) { pr_err("%s: unable to create record_ftrace file\n", __func__); goto err_file; @@ -127,5 +128,17 @@ void pstore_register_ftrace(void) return; err_file: - debugfs_remove(dir); + debugfs_remove(pstore_ftrace_dir); +} + +void pstore_unregister_ftrace(void) +{ + mutex_lock(&pstore_ftrace_lock); + if (pstore_ftrace_enabled) { + unregister_ftrace_function(&pstore_ftrace_ops); + pstore_ftrace_enabled = 0; + } + mutex_unlock(&pstore_ftrace_lock); + + debugfs_remove_recursive(pstore_ftrace_dir); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 3adcc4669fac..d8c439d813ce 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) } static const struct file_operations pstore_file_operations = { + .owner = THIS_MODULE, .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, @@ -287,7 +288,7 @@ static const struct super_operations pstore_ops = { static struct super_block *pstore_sb; -int pstore_is_mounted(void) +bool pstore_is_mounted(void) { return pstore_sb != NULL; } @@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb) } static struct file_system_type pstore_fs_type = { + .owner = THIS_MODULE, .name = "pstore", .mount = pstore_mount, .kill_sb = pstore_kill_sb, @@ -479,5 +481,12 @@ out: } module_init(init_pstore_fs) +static void __exit exit_pstore_fs(void) +{ + unregister_filesystem(&pstore_fs_type); + sysfs_remove_mount_point(fs_kobj, "pstore"); +} +module_exit(exit_pstore_fs) + MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index c36ba2cd0b5d..e38a22b31282 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); +extern void pstore_unregister_ftrace(void); #else static inline void pstore_register_ftrace(void) {} +static inline void pstore_unregister_ftrace(void) {} #endif #ifdef CONFIG_PSTORE_PMSG extern void pstore_register_pmsg(void); +extern void pstore_unregister_pmsg(void); #else static inline void pstore_register_pmsg(void) {} +static inline void pstore_unregister_pmsg(void) {} #endif extern struct pstore_info *psinfo; @@ -59,6 +63,6 @@ extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, int count, char *data, bool compressed, size_t size, struct timespec time, struct pstore_info *psi); -extern int pstore_is_mounted(void); +extern bool pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 791743deedf1..588461bb2dd4 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void) } +static void free_buf_for_compression(void) +{ + kfree(stream.workspace); + stream.workspace = NULL; + kfree(big_oops_buf); + big_oops_buf = NULL; +} + /* * Called when compression fails, since the printk buffer * would be fetched for compression calling it again when @@ -353,6 +361,19 @@ static struct kmsg_dumper pstore_dumper = { .dump = pstore_dump, }; +/* + * Register with kmsg_dump to save last part of console log on panic. + */ +static void pstore_register_kmsg(void) +{ + kmsg_dump_register(&pstore_dumper); +} + +static void pstore_unregister_kmsg(void) +{ + kmsg_dump_unregister(&pstore_dumper); +} + #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { @@ -390,8 +411,14 @@ static void pstore_register_console(void) { register_console(&pstore_console); } + +static void pstore_unregister_console(void) +{ + unregister_console(&pstore_console); +} #else static void pstore_register_console(void) {} +static void pstore_unregister_console(void) {} #endif static int pstore_write_compat(enum pstore_type_id type, @@ -410,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type, * read function right away to populate the file system. If not * then the pstore mount code will call us later to fill out * the file system. - * - * Register with kmsg_dump to save last part of console log on panic. */ int pstore_register(struct pstore_info *psi) { @@ -442,7 +467,7 @@ int pstore_register(struct pstore_info *psi) if (pstore_is_mounted()) pstore_get_records(0); - kmsg_dump_register(&pstore_dumper); + pstore_register_kmsg(); if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { pstore_register_console(); @@ -462,12 +487,28 @@ int pstore_register(struct pstore_info *psi) */ backend = psi->name; + module_put(owner); + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; } EXPORT_SYMBOL_GPL(pstore_register); +void pstore_unregister(struct pstore_info *psi) +{ + pstore_unregister_pmsg(); + pstore_unregister_ftrace(); + pstore_unregister_console(); + pstore_unregister_kmsg(); + + free_buf_for_compression(); + + psinfo = NULL; + backend = NULL; +} +EXPORT_SYMBOL_GPL(pstore_unregister); + /* * Read all the records from the persistent store. Create * files in our filesystem. Don't warn about -EEXIST errors diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index feb5dd2948b4..7de20cd3797f 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; buffer = vmalloc(buffer_size); + if (!buffer) + return -ENOMEM; mutex_lock(&pmsg_lock); for (i = 0; i < count; ) { @@ -112,3 +114,10 @@ err_class: err: return; } + +void pstore_unregister_pmsg(void) +{ + device_destroy(pmsg_class, MKDEV(pmsg_major, 0)); + class_destroy(pmsg_class); + unregister_chrdev(pmsg_major, PMSG_NAME); +} diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6c26c4daaec9..319c3a60cfa5 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -578,30 +578,27 @@ fail_out: return err; } -static int __exit ramoops_remove(struct platform_device *pdev) +static int ramoops_remove(struct platform_device *pdev) { -#if 0 - /* TODO(kees): We cannot unload ramoops since pstore doesn't support - * unregistering yet. - */ struct ramoops_context *cxt = &oops_cxt; - iounmap(cxt->virt_addr); - release_mem_region(cxt->phys_addr, cxt->size); + pstore_unregister(&cxt->pstore); cxt->max_dump_cnt = 0; - /* TODO(kees): When pstore supports unregistering, call it here. */ kfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; + persistent_ram_free(cxt->mprz); + persistent_ram_free(cxt->fprz); + persistent_ram_free(cxt->cprz); + ramoops_free_przs(cxt); + return 0; -#endif - return -EBUSY; } static struct platform_driver ramoops_driver = { .probe = ramoops_probe, - .remove = __exit_p(ramoops_remove), + .remove = ramoops_remove, .driver = { .name = "ramoops", }, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba1323a94924..a586467f6ff6 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,6 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) unsigned order; void *data; int ret; + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); /* make various checks */ order = get_order(newsize); @@ -84,7 +85,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* allocate enough contiguous pages to be able to satisfy the * request */ - pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order); + pages = alloc_pages(gfp, order); if (!pages) return -ENOMEM; @@ -108,7 +109,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) struct page *page = pages + loop; ret = add_to_page_cache_lru(page, inode->i_mapping, loop, - GFP_KERNEL); + gfp); if (ret < 0) goto add_error; diff --git a/fs/sync.c b/fs/sync.c index fbc98ee62044..4ec430ae2b0d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) static void fdatawait_one_bdev(struct block_device *bdev, void *arg) { - filemap_fdatawait(bdev->bd_inode->i_mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); } /* diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6c95628ea377..f35523d4fa3a 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); struct kobject *kobj = of->kn->parent->priv; + size_t len; /* * If buf != of->prealloc_buf, we don't know how @@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, */ if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) return 0; - return ops->show(kobj, of->kn->priv, buf); + len = ops->show(kobj, of->kn->priv, buf); + return min(count, len); } /* kernfs write callback for regular sysfs files */ diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 39a019936768..e1236594fffe 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -352,3 +352,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); + +/** + * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing + * to a group or an attribute + * @kobj: The kobject containing the group. + * @target_kobj: The target kobject. + * @target_name: The name of the target group or attribute. + */ +int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, + struct kobject *target_kobj, + const char *target_name) +{ + struct kernfs_node *target; + struct kernfs_node *entry; + struct kernfs_node *link; + + /* + * We don't own @target_kobj and it may be removed at any time. + * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() + * for details. + */ + spin_lock(&sysfs_symlink_target_lock); + target = target_kobj->sd; + if (target) + kernfs_get(target); + spin_unlock(&sysfs_symlink_target_lock); + if (!target) + return -ENOENT; + + entry = kernfs_find_and_get(target_kobj->sd, target_name); + if (!entry) { + kernfs_put(target); + return -ENOENT; + } + + link = kernfs_create_link(kobj->sd, target_name, entry); + if (IS_ERR(link) && PTR_ERR(link) == -EEXIST) + sysfs_warn_dup(kobj->sd, target_name); + + kernfs_put(entry); + kernfs_put(target); + return IS_ERR(link) ? PTR_ERR(link) : 0; +} +EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cbc8d5d2755a..c66f2423e1f5 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) dput(dentry); dentry = ERR_PTR(-EEXIST); } - if (IS_ERR(dentry)) + + if (IS_ERR(dentry)) { mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + } + return dentry; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 96f3448b6eb4..fd65b3f1923c 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -652,11 +652,8 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, { int err; - mutex_lock(&inode->i_mutex); err = security_inode_init_security(inode, dentry, qstr, &init_xattrs, 0); - mutex_unlock(&inode->i_mutex); - if (err) { struct ubifs_info *c = dentry->i_sb->s_fs_info; ubifs_err(c, "cannot initialize security for inode %lu, error %d", diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9aeb40a7197..50311703135b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -467,8 +467,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * the fault_*wqh. */ spin_lock(&ctx->fault_pending_wqh.lock); - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); spin_unlock(&ctx->fault_pending_wqh.lock); wake_up_poll(&ctx->fd_wqh, POLLHUP); @@ -650,10 +650,10 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) - __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range); spin_unlock(&ctx->fault_pending_wqh.lock); } |