diff options
Diffstat (limited to 'fs')
70 files changed, 1309 insertions, 809 deletions
diff --git a/fs/afs/main.c b/fs/afs/main.c index 42dd2e499ed8..35de0c04729f 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void) afs_uuid.time_low = uuidtime; afs_uuid.time_mid = uuidtime >> 32; afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK; - afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME; + afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME; get_random_bytes(&clockseq, 2); afs_uuid.clock_seq_low = clockseq; afs_uuid.clock_seq_hi_and_reserved = (clockseq >> 8) & AFS_UUID_CLOCKHI_MASK; - afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD; + afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD; _debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", afs_uuid.time_low, @@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work) aio_free_ring(ctx); free_percpu(ctx->cpu); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); } @@ -715,8 +717,8 @@ err_ctx: err: mutex_unlock(&ctx->ring_lock); free_percpu(ctx->cpu); - free_percpu(ctx->reqs.pcpu_count); - free_percpu(ctx->users.pcpu_count); + percpu_ref_exit(&ctx->reqs); + percpu_ref_exit(&ctx->users); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820d158b..3e11aab9f391 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3437,16 +3437,10 @@ done_unlocked: return 0; } -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, + TASK_UNINTERRUPTIBLE); } static noinline_for_stack int diff --git a/fs/buffer.c b/fs/buffer.c index eba6e4f621ce..8f05111bbb8b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(touch_buffer); -static int sleep_on_buffer(void *word) -{ - io_schedule(); - return 0; -} - void __lock_buffer(struct buffer_head *bh) { - wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, - TASK_UNINTERRUPTIBLE); + wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); @@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback); */ void __wait_on_buffer(struct buffer_head * bh) { - wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 20d75b8ddb26..b98366f21f9e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3934,13 +3934,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); } -static int -cifs_sb_tcon_pending_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /* find and return a tlink with given uid */ static struct tcon_link * tlink_rb_search(struct rb_root *root, kuid_t uid) @@ -4039,11 +4032,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, - cifs_sb_tcon_pending_wait, TASK_INTERRUPTIBLE); if (ret) { cifs_put_tlink(tlink); - return ERR_PTR(ret); + return ERR_PTR(-ERESTARTSYS); } /* if it's good, return it */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e90a1e9aa627..b88b1ade4d3d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3618,13 +3618,6 @@ static int cifs_launder_page(struct page *page) return rc; } -static int -cifs_pending_writers_wait(void *unused) -{ - schedule(); - return 0; -} - void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -3636,7 +3629,7 @@ void cifs_oplock_break(struct work_struct *work) int rc = 0; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, - cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); server->ops->downgrade_oplock(server, cinode, test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a174605f6afa..41de3935caa0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1780,7 +1780,7 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(void *word) +cifs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -1794,8 +1794,8 @@ cifs_revalidate_mapping(struct inode *inode) int rc; unsigned long *flags = &CIFS_I(inode)->flags; - rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, - TASK_KILLABLE); + rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3b0c62e622da..6bf55d0ed494 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -582,7 +582,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode) start: rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, - cifs_oplock_break_wait, TASK_KILLABLE); + TASK_KILLABLE); if (rc) return rc; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index e82289047272..afec6450450f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -59,7 +59,7 @@ #include <linux/gfp.h> #include <net/bluetooth/bluetooth.h> -#include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 63146295153b..76c08c2beb2f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -451,7 +451,7 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf, { char buf[3]; u32 *val = file->private_data; - + if (*val) buf[0] = 'Y'; else diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8c41b52da358..1e3b99d3db0d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -66,7 +66,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev break; } } - return inode; + return inode; } /* SMP-safe */ @@ -317,7 +317,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, goto exit; /* If the parent is not specified, we create it in the root. - * We need the root dentry to do this, which is in the super + * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ @@ -330,7 +330,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, switch (mode & S_IFMT) { case S_IFDIR: error = debugfs_mkdir(parent->d_inode, dentry, mode); - + break; case S_IFLNK: error = debugfs_link(parent->d_inode, dentry, mode, @@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove); */ void debugfs_remove_recursive(struct dentry *dentry) { - struct dentry *child, *next, *parent; + struct dentry *child, *parent; if (IS_ERR_OR_NULL(dentry)) return; @@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: mutex_lock(&parent->d_inode->i_mutex); - list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { + loop: + /* + * The parent->d_subdirs is protected by the d_lock. Outside that + * lock, the child can be unlinked and set to be freed which can + * use the d_u.d_child as the rcu head and corrupt this list. + */ + spin_lock(&parent->d_lock); + list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) { if (!debugfs_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); parent = child; goto down; } - up: + + spin_unlock(&parent->d_lock); + if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + /* + * The parent->d_lock protects agaist child from unlinking + * from d_subdirs. When releasing the parent->d_lock we can + * no longer trust that the next pointer is valid. + * Restart the loop. We'll skip this one with the + * debugfs_positive() check. + */ + goto loop; } + spin_unlock(&parent->d_lock); mutex_unlock(&parent->d_inode->i_mutex); child = parent; parent = parent->d_parent; mutex_lock(&parent->d_inode->i_mutex); - if (child != dentry) { - next = list_next_entry(child, d_u.d_child); - goto up; - } + if (child != dentry) + /* go up */ + goto loop; if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); diff --git a/fs/direct-io.c b/fs/direct-io.c index 194d0d122cae..17e39b047de5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -71,7 +71,6 @@ struct dio_submit { been performed at the start of a write */ int pages_in_io; /* approximate total IO pages */ - size_t size; /* total request size (doesn't change)*/ sector_t block_in_file; /* Current offset into the underlying file in dio_block units. */ unsigned blocks_available; /* At block_in_file. changes */ @@ -1104,7 +1103,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, unsigned blkbits = i_blkbits; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; - loff_t end = offset + iov_iter_count(iter); + size_t count = iov_iter_count(iter); + loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; struct buffer_head map_bh = { 0, }; @@ -1287,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - ((rw == READ) || (dio->result == sdio.size))) + (rw == READ || dio->result == count)) retval = -EIOCBQUEUED; - - if (retval != -EIOCBQUEUED) + else dio_await_completion(dio); if (drop_refcount(dio) == 0) { diff --git a/fs/exec.c b/fs/exec.c index a3d33fe592d6..ab1f1200ce5d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,7 +1216,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against - * PTRACE_ATTACH + * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { @@ -1234,7 +1234,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ - if (current->no_new_privs) + if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; t = p; @@ -1272,7 +1272,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs && + !task_no_new_privs(current) && kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fca382037ddd..581ef40fbe90 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -639,7 +639,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, if (!(*errp) && ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef1bed66c14f..0bb3f9ea0832 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -571,6 +571,31 @@ static int ext4_release_dir(struct inode *inode, struct file *filp) return 0; } +int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, + int buf_size) +{ + struct ext4_dir_entry_2 *de; + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size; + while ((char *) de < top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -EIO; + + return 0; +} + const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7cc5a0e23688..5b19760b1de5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -591,7 +591,6 @@ enum { #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 -#define EXT4_FREE_BLOCKS_RESERVE 0x0040 /* * ioctl commands @@ -2029,6 +2028,8 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype) return ext4_filetype_table[filetype]; } +extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -2144,8 +2145,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); -extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop); +extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); @@ -2560,7 +2561,6 @@ extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ -extern int ext4_has_inline_data(struct inode *inode); extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, @@ -2626,6 +2626,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); +static inline int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4da228a0e6d0..76c2df382b7d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { int err; + + WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); if (path->p_bh) { ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ @@ -1808,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | - EXT4_FREE_BLOCKS_RESERVE); + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* @@ -3253,7 +3254,7 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); + ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -5403,16 +5404,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) int ret; /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || - len & (EXT4_BLOCK_SIZE(sb) - 1)) + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) return -EINVAL; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) - return -EOPNOTSUPP; - trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8695f70af1ef..aca7b24a4432 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,10 +200,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; file_accessed(file); vma->vm_ops = &ext4_file_vm_ops; return 0; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index fd69da194826..e75f840000a0 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1295,97 +1295,220 @@ do_indirects: } } -static int free_hole_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, __le32 *i_data, - int level, ext4_lblk_t first, - ext4_lblk_t count, int max) +/** + * ext4_ind_remove_space - remove space from the range + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @start: First block to remove + * @end: One block after the last block to remove (exclusive) + * + * Free the blocks in the defined range (end is exclusive endpoint of + * range). This is used by ext4_punch_hole(). + */ +int ext4_ind_remove_space(handle_t *handle, struct inode *inode, + ext4_lblk_t start, ext4_lblk_t end) { - struct buffer_head *bh = NULL; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ret = 0; - int i, inc; - ext4_lblk_t offset; - __le32 blk; - - inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); - for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { - if (offset >= count + first) - break; - if (*i_data == 0 || (offset + inc) <= first) - continue; - blk = *i_data; - if (level > 0) { - ext4_lblk_t first2; - ext4_lblk_t count2; + ext4_lblk_t offsets[4], offsets2[4]; + Indirect chain[4], chain2[4]; + Indirect *partial, *partial2; + ext4_lblk_t max_block; + __le32 nr = 0, nr2 = 0; + int n = 0, n2 = 0; + unsigned blocksize = inode->i_sb->s_blocksize; - bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), - "Read failure"); - return -EIO; - } - if (first > offset) { - first2 = first - offset; - count2 = count; + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (end >= max_block) + end = max_block; + if ((start >= end) || (start > max_block)) + return 0; + + n = ext4_block_to_path(inode, start, offsets, NULL); + n2 = ext4_block_to_path(inode, end, offsets2, NULL); + + BUG_ON(n > n2); + + if ((n == 1) && (n == n2)) { + /* We're punching only within direct block range */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + offsets2[0]); + return 0; + } else if (n2 > n) { + /* + * Start and end are on a different levels so we're going to + * free partial block at start, and partial block at end of + * the range. If there are some levels in between then + * do_indirects label will take care of that. + */ + + if (n == 1) { + /* + * Start is at the direct block level, free + * everything to the end of the level. + */ + ext4_free_data(handle, inode, NULL, i_data + offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto end_range; + } + + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; } else { - first2 = 0; - count2 = count - (offset - first); + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); } - ret = free_hole_blocks(handle, inode, bh, - (__le32 *)bh->b_data, level - 1, - first2, count2, - inode->i_sb->s_blocksize >> 2); - if (ret) { - brelse(bh); - goto err; + } + + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + +end_range: + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + if (nr2) { + if (partial2 == chain2) { + /* + * Remember, end is exclusive so here we're at + * the start of the next level we're not going + * to free. Everything was covered by the start + * of the range. + */ + return 0; + } else { + /* Shared branch grows from an indirect block */ + partial2--; } + } else { + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; } - if (level == 0 || - (bh && all_zeroes((__le32 *)bh->b_data, - (__le32 *)bh->b_data + addr_per_block))) { - ext4_free_data(handle, inode, parent_bh, - i_data, i_data + 1); + + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + while (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } - brelse(bh); - bh = NULL; + goto do_indirects; } -err: - return ret; -} - -int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t first, ext4_lblk_t stop) -{ - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int level, ret = 0; - int num = EXT4_NDIR_BLOCKS; - ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; - __le32 *i_data = EXT4_I(inode)->i_data; - - count = stop - first; - for (level = 0; level < 4; level++, max *= addr_per_block) { - if (first < max) { - ret = free_hole_blocks(handle, inode, NULL, i_data, - level, first, count, num); - if (ret) - goto err; - if (count > max - first) - count -= max - first; - else - break; - first = 0; - } else { - first -= max; + /* Punch happened within the same level (n == n2) */ + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + /* + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element + */ + partial2->p++; + while ((partial > chain) || (partial2 > chain2)) { + /* We're at the same block, so we're almost finished */ + if ((partial->bh && partial2->bh) && + (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { + if ((partial > chain) && (partial2 > chain2)) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + partial2->p, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } + return 0; } - i_data += num; - if (level == 0) { - num = 1; - max = 1; + /* + * Clear the ends of indirect blocks on the shared branch + * at the start of the range + */ + if (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + /* + * Clear the ends of indirect blocks on the shared branch + * at the end of the range + */ + if (partial2 > chain2) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, + (chain2+n-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; } } -err: - return ret; +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + if (++n >= n2) + return 0; + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + if (++n >= n2) + return 0; + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + return 0; } - diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 645205d8ada6..bea662bd0ca6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -120,12 +120,6 @@ int ext4_get_max_inline_size(struct inode *inode) return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; } -int ext4_has_inline_data(struct inode *inode) -{ - return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && - EXT4_I(inode)->i_inline_off; -} - /* * this function does not take xattr_sem, which is OK because it is * currently only used in a code path coming form ext4_iget, before @@ -1178,6 +1172,18 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (error < 0) goto out; + /* + * Make sure the inline directory entries pass checks before we try to + * convert them, so that we avoid touching stuff that needs fsck. + */ + if (S_ISDIR(inode->i_mode)) { + error = ext4_check_all_de(inode, iloc->bh, + buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + if (error) + goto out; + } + error = ext4_destroy_inline_data_nolock(handle, inode); if (error) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a064734e6eb..367a60c07cf0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -325,18 +325,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) #endif /* - * Calculate the number of metadata blocks need to reserve - * to allocate a block located at @lblock - */ -static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_calc_metadata_amount(inode, lblock); - - return ext4_ind_calc_metadata_amount(inode, lblock); -} - -/* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ @@ -357,35 +345,10 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } - if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { - ext4_warning(inode->i_sb, "ino %lu, allocated %d " - "with only %d reserved metadata blocks " - "(releasing %d blocks with reserved %d data blocks)", - inode->i_ino, ei->i_allocated_meta_blocks, - ei->i_reserved_meta_blocks, used, - ei->i_reserved_data_blocks); - WARN_ON(1); - ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; - } - /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; - ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - used + ei->i_allocated_meta_blocks); - ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* Update quota subsystem for data blocks */ @@ -1222,49 +1185,6 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a metadata for a single block located at lblock - */ -static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; - ext4_lblk_t save_last_lblock; - int save_len; - - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ - spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); - - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; - spin_unlock(&ei->i_block_reservation_lock); - return -ENOSPC; - } - ei->i_reserved_meta_blocks += md_needed; - spin_unlock(&ei->i_block_reservation_lock); - - return 0; /* success */ -} - -/* * Reserve a single cluster located at lblock */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) @@ -1273,8 +1193,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; int ret; - ext4_lblk_t save_last_lblock; - int save_len; /* * We will charge metadata quota at writeout time; this saves @@ -1295,25 +1213,15 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * ext4_calc_metadata_amount() has side effects, which we have * to be prepared undo if we fail to claim space. */ - save_len = ei->i_da_metadata_calc_len; - save_last_lblock = ei->i_da_metadata_calc_last_lblock; - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); + md_needed = 0; + trace_ext4_da_reserve_space(inode, 0); - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. - */ - if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { - ei->i_da_metadata_calc_len = save_len; - ei->i_da_metadata_calc_last_lblock = save_last_lblock; + if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; - ei->i_reserved_meta_blocks += md_needed; spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1346,20 +1254,6 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } ei->i_reserved_data_blocks -= to_free; - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - * Note that in case of bigalloc, i_reserved_meta_blocks, - * i_reserved_data_blocks, etc. refer to number of clusters. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } - /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); @@ -1500,10 +1394,6 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); - ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - ei->i_reserved_meta_blocks); - ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", - ei->i_allocated_meta_blocks); return; } @@ -1620,13 +1510,6 @@ add_delayed: retval = ret; goto out_unlock; } - } else { - ret = ext4_da_reserve_metadata(inode, iblock); - if (ret) { - /* not enough space to reserve */ - retval = ret; - goto out_unlock; - } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2843,8 +2726,7 @@ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) + if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* @@ -3624,7 +3506,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else - ret = ext4_free_hole_blocks(handle, inode, first_block, + ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2dcb936be90e..956027711faf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3075,8 +3075,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { - start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; - size = ac->ac_o_ex.fe_len << bsbits; + start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; + size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; @@ -3216,8 +3217,27 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; + struct ext4_buddy e4b; + int err; - if (pa && pa->pa_type == MB_INODE_PA) + if (pa == NULL) { + err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); + if (err) { + /* + * This should never happen since we pin the + * pages in the ext4_allocation_context so + * ext4_mb_load_buddy() should never fail. + */ + WARN(1, "mb_load_buddy failed (%d)", err); + return; + } + ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, + ac->ac_f_ex.fe_len); + ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + return; + } + if (pa->pa_type == MB_INODE_PA) pa->pa_free += ac->ac_b_ex.fe_len; } @@ -4627,7 +4647,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; @@ -4838,19 +4857,7 @@ do_more: &sbi->s_flex_groups[flex_group].free_clusters); } - if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, - count_clusters); - spin_lock(&ei->i_block_reservation_lock); - if (flags & EXT4_FREE_BLOCKS_METADATA) - ei->i_reserved_meta_blocks += count_clusters; - else - ei->i_reserved_data_blocks += count_clusters; - spin_unlock(&ei->i_block_reservation_lock); - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_reclaim_block(inode, - EXT4_C2B(sbi, count_clusters)); - } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index ec092437d3e0..d3567f27bae7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -39,6 +39,8 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); + /* Locking only for convinience since we are operating on temp inode */ + down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -61,7 +63,9 @@ static int finish_range(handle_t *handle, struct inode *inode, */ if (needed && ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS)) { + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } else if (needed) { @@ -70,13 +74,16 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * IF not able to extend the journal restart the journal */ + up_write((&EXT4_I(inode)->i_data_sem)); retval = ext4_journal_restart(handle, needed); + down_write((&EXT4_I(inode)->i_data_sem)); if (retval) goto err_out; } } retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: + up_write((&EXT4_I(inode)->i_data_sem)); if (path) { ext4_ext_drop_refs(path); kfree(path); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2484c7ec6a72..671a74b14fd7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1013,10 +1013,11 @@ data_copy: *err = -EBUSY; goto unlock_pages; } - + ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, orig_blk_offset, block_len_in_page, err); + ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6df7bc611dbd..32b43ad154b9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2142,10 +2142,6 @@ static int ext4_check_descriptors(struct super_block *sb, } if (NULL != first_not_zeroed) *first_not_zeroed = grp; - - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, ext4_count_free_clusters(sb))); - sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -3883,13 +3879,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - if (!ext4_fill_flex_info(sb)) { - ext4_msg(sb, KERN_ERR, - "unable to initialize " - "flex_bg meta info!"); - goto failed_mount2; - } sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -3902,23 +3891,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); - } - if (!err) { - err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); - } - if (err) { + if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -4022,18 +3995,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - /* - * The journal may have updated the bg summary counts, so we - * need to update the global counters. - */ - percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - percpu_counter_set(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - percpu_counter_set(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); - no_journal: if (ext4_mballoc_ready) { sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); @@ -4141,6 +4102,33 @@ no_journal: goto failed_mount5; } + block = ext4_count_free_clusters(sb); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount6; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount6; + } + err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; @@ -4215,6 +4203,12 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); @@ -4233,12 +4227,6 @@ failed_mount_wq: failed_mount3: ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); - if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); @@ -4556,11 +4544,13 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) + ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeclusters_counter))); - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( + if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index dbe2141d10ad..83b9b5a8d112 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; - if (acl) { - error = posix_acl_valid(acl); - if (error < 0) - return error; - } - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0b4710c1d370..6aeed5bada52 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -22,7 +22,7 @@ #include "segment.h" #include <trace/events/f2fs.h> -static struct kmem_cache *orphan_entry_slab; +static struct kmem_cache *ino_entry_slab; static struct kmem_cache *inode_entry_slab; /* @@ -282,72 +282,120 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; +retry: + spin_lock(&sbi->ino_lock[type]); + + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (!e) { + e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); + if (!e) { + spin_unlock(&sbi->ino_lock[type]); + goto retry; + } + if (radix_tree_insert(&sbi->ino_root[type], ino, e)) { + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + goto retry; + } + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &sbi->ino_list[type]); + } + spin_unlock(&sbi->ino_lock[type]); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + struct ino_entry *e; + + spin_lock(&sbi->ino_lock[type]); + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[type], ino); + if (type == ORPHAN_INO) + sbi->n_orphans--; + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + return; + } + spin_unlock(&sbi->ino_lock[type]); +} + +void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, type); +} + +void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct ino_entry *e; + spin_lock(&sbi->ino_lock[mode]); + e = radix_tree_lookup(&sbi->ino_root[mode], ino); + spin_unlock(&sbi->ino_lock[mode]); + return e ? true : false; +} + +static void release_dirty_inode(struct f2fs_sb_info *sbi) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = APPEND_INO; i <= UPDATE_INO; i++) { + spin_lock(&sbi->ino_lock[i]); + list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[i], e->ino); + kmem_cache_free(ino_entry_slab, e); + } + spin_unlock(&sbi->ino_lock[i]); + } +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *new, *orphan; - - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, new); - return; - } - - if (orphan->ino > ino) - break; - } - - /* add new orphan entry into list which is sorted by inode number */ - list_add_tail(&new->list, &orphan->list); - spin_unlock(&sbi->orphan_inode_lock); + /* add new orphan ino entry into list */ + __add_ino_entry(sbi, ino, ORPHAN_INO); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *orphan; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - list_del(&orphan->list); - f2fs_bug_on(sbi->n_orphans == 0); - sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, orphan); - return; - } - } - spin_unlock(&sbi->orphan_inode_lock); + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino, ORPHAN_INO); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -401,14 +449,14 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); struct page *page = NULL; - struct orphan_inode_entry *orphan = NULL; + struct ino_entry *orphan = NULL; for (index = 0; index < orphan_blocks; index++) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; + spin_lock(&sbi->ino_lock[ORPHAN_INO]); + head = &sbi->ino_list[ORPHAN_INO]; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { @@ -448,7 +496,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -714,10 +762,10 @@ retry_flush_dents: * until finishing nat/sit flush. */ retry_flush_nodes: - mutex_lock(&sbi->node_write); + down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); sync_node_pages(sbi, 0, &wbc); goto retry_flush_nodes; } @@ -726,7 +774,7 @@ retry_flush_nodes: static void unblock_operations(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); f2fs_unlock_all(sbi); } @@ -748,6 +796,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); nid_t last_nid = 0; block_t start_blk; struct page *cp_page; @@ -761,7 +810,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi); + discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -885,8 +934,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { + if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { clear_prefree_segments(sbi); + release_dirty_inode(sbi); F2FS_RESET_SB_DIRT(sbi); } } @@ -932,31 +982,37 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } -void init_orphan_info(struct f2fs_sb_info *sbi) +void init_ino_entry_info(struct f2fs_sb_info *sbi) { - spin_lock_init(&sbi->orphan_inode_lock); - INIT_LIST_HEAD(&sbi->orphan_inode_list); - sbi->n_orphans = 0; + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC); + spin_lock_init(&sbi->ino_lock[i]); + INIT_LIST_HEAD(&sbi->ino_list[i]); + } + /* * considering 512 blocks in a segment 8 blocks are needed for cp * and log segment summaries. Remaining blocks are used to keep * orphan entries with the limitation one reserved segment * for cp pack we can have max 1020*504 orphan entries */ + sbi->n_orphans = 0; sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { - orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry)); - if (!orphan_entry_slab) + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; @@ -964,6 +1020,6 @@ int __init create_checkpoint_caches(void) void destroy_checkpoint_caches(void) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8cf619edb5f..03313099c51c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + if (test_opt(sbi, NOBARRIER)) + io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + else + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); up_write(&io->io_rwsem); @@ -626,8 +629,10 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (check_extent_cache(inode, pgofs, bh_result)) goto out; - if (create) + if (create) { + f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -784,9 +789,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) !is_cold_data(page) && need_inplace_update(inode))) { rewrite_data_page(page, old_blkaddr, fio); + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); } else { write_data_page(page, &dn, &new_blkaddr, fio); update_extent_cache(new_blkaddr, &dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); } out_writepage: f2fs_put_dnode(&dn); @@ -914,6 +921,16 @@ skip_write: return 0; } +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + truncate_blocks(inode, inode->i_size); + } +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -931,11 +948,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, repeat: err = f2fs_convert_inline_data(inode, pos + len); if (err) - return err; + goto fail; page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto fail; + } /* to avoid latency during memory pressure */ unlock_page(page); @@ -949,10 +968,9 @@ repeat: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); - if (err) { f2fs_put_page(page, 0); - return err; + goto fail; } inline_data: lock_page(page); @@ -982,19 +1000,20 @@ inline_data: err = f2fs_read_inline_data(inode, page); if (err) { page_cache_release(page); - return err; + goto fail; } } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) - return err; + goto fail; } lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return -EIO; + err = -EIO; + goto fail; } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -1005,6 +1024,9 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; +fail: + f2fs_write_failed(mapping, pos + len); + return err; } static int f2fs_write_end(struct file *file, @@ -1016,7 +1038,6 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - SetPageUptodate(page); set_page_dirty(page); if (pos + copied > i_size_read(inode)) { @@ -1050,7 +1071,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + int err; /* Let buffer I/O handle the inline data case. */ if (f2fs_has_inline_data(inode)) @@ -1062,8 +1086,15 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, /* clear fsync mark to recover these blocks */ fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); - return blockdev_direct_IO(rw, iocb, inode, iter, offset, - get_data_block); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); + if (err < 0 && (rw & WRITE)) + f2fs_write_failed(mapping, offset + count); + + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); + + return err; } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b52c12cf5873..a441ba33be11 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -167,7 +167,7 @@ get_cache: si->cache_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); + si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); } @@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void) f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - goto bail; + return; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); - if (!file) - goto free_debugfs_dir; - - return; - -free_debugfs_dir: - debugfs_remove(f2fs_debugfs_root); - -bail: - f2fs_debugfs_root = NULL; - return; + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + } } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a4addd72ebbd..bcf893c3d903 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(const char *name, size_t namelen, - f2fs_hash_t namehash, struct f2fs_dir_entry *de) +static bool early_match_name(size_t namelen, f2fs_hash_t namehash, + struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) return false; @@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, size_t namelen, int *max_slots, + struct qstr *name, int *max_slots, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; @@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, continue; } de = &dentry_blk->dentry[bit_pos]; - if (early_match_name(name, namelen, namehash, de)) { + if (early_match_name(name->len, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], - name, namelen)) { + name->name, + name->len)) { *res_page = dentry_page; goto found; } @@ -120,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, *max_slots = max_len; max_len = 0; } + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs are occurred. + */ + f2fs_bug_on(!de->name_len); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -132,10 +140,10 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, size_t namelen, + unsigned int level, struct qstr *name, f2fs_hash_t namehash, struct page **res_page) { - int s = GET_DENTRY_SLOTS(namelen); + int s = GET_DENTRY_SLOTS(name->len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; @@ -160,8 +168,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, continue; } - de = find_in_block(dentry_page, name, namelen, - &max_slots, namehash, res_page); + de = find_in_block(dentry_page, name, &max_slots, + namehash, res_page); if (de) break; @@ -187,8 +195,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { - const char *name = child->name; - size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -200,12 +206,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, *res_page = NULL; - name_hash = f2fs_dentry_hash(name, namelen); + name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, name, - namelen, name_hash, res_page); + de = find_in_level(dir, level, child, name_hash, res_page); if (de) break; } @@ -298,14 +303,13 @@ static int make_empty_dir(struct inode *inode, struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; - void *kaddr; dentry_page = get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); @@ -323,7 +327,7 @@ static int make_empty_dir(struct inode *inode, test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); @@ -333,11 +337,12 @@ static int make_empty_dir(struct inode *inode, static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct page *page; int err; if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - page = new_inode_page(inode, name); + page = new_inode_page(inode); if (IS_ERR(page)) return page; @@ -362,7 +367,8 @@ static struct page *init_inode_metadata(struct inode *inode, set_cold_node(inode, page); } - init_dent_inode(name, page); + if (name) + init_dent_inode(name, page); /* * This file should be checkpointed during fsync. @@ -370,6 +376,12 @@ static struct page *init_inode_metadata(struct inode *inode, */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + remove_orphan_inode(sbi, inode->i_ino); inc_nlink(inode); } return page; @@ -453,7 +465,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name->name, name->len); + dentry_hash = f2fs_dentry_hash(name); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -529,6 +541,27 @@ fail: return err; } +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + /* we don't need to mark_inode_dirty now */ + update_inode(inode, page); + f2fs_put_page(page, 1); + + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + return err; +} + /* * It only removes the dentry from the dentry page,corresponding name * entry in name page does not need to be touched during deletion. @@ -541,14 +574,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - void *kaddr = page_address(page); int i; lock_page(page); f2fs_wait_on_page_writeback(page, DATA); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); @@ -603,7 +635,6 @@ bool f2fs_empty_dir(struct inode *dir) unsigned long nblock = dir_blocks(dir); for (bidx = 0; bidx < nblock; bidx++) { - void *kaddr; dentry_page = get_lock_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) @@ -612,8 +643,8 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -621,7 +652,7 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58df97e174d0..4dab5338a97a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,6 +41,7 @@ #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 #define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -99,8 +100,15 @@ enum { META_SSA }; -/* for the list of orphan inodes */ -struct orphan_inode_entry { +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ }; @@ -256,6 +264,8 @@ struct f2fs_nm_info { unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + struct list_head nat_entry_set; /* nat entry set list */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -442,14 +452,17 @@ struct f2fs_sb_info { struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct mutex node_write; /* locking node writes */ + struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; - /* for orphan inode management */ - struct list_head orphan_inode_list; /* orphan inode list */ - spinlock_t orphan_inode_lock; /* for orphan inode list */ + /* for inode management */ + struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */ + spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ + struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ + + /* for orphan inode, use 0'th array */ unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ @@ -768,7 +781,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else - return ((unsigned char *)ckpt + F2FS_BLKSIZE); + return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; @@ -983,11 +996,15 @@ enum { FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used fo ipu for fdatasync */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) { - set_bit(flag, &fi->flags); + if (!test_bit(flag, &fi->flags)) + set_bit(flag, &fi->flags); } static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) @@ -997,7 +1014,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) { - clear_bit(flag, &fi->flags); + if (test_bit(flag, &fi->flags)) + clear_bit(flag, &fi->flags); } static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) @@ -1136,6 +1154,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_do_tmpfile(struct inode *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); @@ -1155,7 +1174,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, size_t); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1173,7 +1192,7 @@ int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); void remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); @@ -1185,6 +1204,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +void recover_inline_xattr(struct inode *, struct page *); bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, @@ -1206,7 +1226,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); @@ -1240,6 +1260,9 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); @@ -1251,7 +1274,7 @@ void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); -void init_orphan_info(struct f2fs_sb_info *); +void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1295,7 +1318,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *); struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; - struct mutex stat_lock; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; int hit_ext, total_ext; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d8b96275092..208f1a9bd569 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -127,12 +127,30 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return 0; trace_f2fs_sync_file_enter(inode); + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync) + set_inode_flag(fi, FI_NEED_IPU); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (datasync) + clear_inode_flag(fi, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + exist_written_data(sbi, inode->i_ino, UPDATE_INO)) + goto flush_out; + goto out; + } + /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -188,6 +206,13 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; + + /* once recovery info is written, don't need to tack this */ + remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: @@ -206,8 +231,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, /* find first dirty page index */ pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, + PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -272,8 +298,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -380,13 +405,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) { - f2fs_put_page(page, 1); - return; - } + if (unlikely(!PageUptodate(page) || + page->mapping != inode->i_mapping)) + goto out; + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); + +out: f2fs_put_page(page, 1); } @@ -645,6 +672,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; + f2fs_balance_fs(sbi); + ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b90dbe55403a..d7947d90ccc3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int hint = 0; unsigned int secno; /* @@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. */ -next: - secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); - if (secno < TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { if (sec_usage_check(sbi, secno)) - goto next; + continue; clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6eb8d269b53b..948d17bf7281 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -69,12 +69,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; const char *p; __u32 in[8], buf[4]; + const char *name = name_info->name; + size_t len = name_info->len; if ((len <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1bba5228c197..5beeccef9ae1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -172,6 +172,7 @@ int f2fs_write_inline_data(struct inode *inode, stat_inc_inline_inode(inode); } + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); sync_inode_page(&dn); f2fs_put_dnode(&dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2cf6962f6cc8..2c39999f3868 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -267,13 +267,14 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) - goto no_delete; + goto out_clear; f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); @@ -295,6 +296,13 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: - clear_inode(inode); invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); + if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); +out_clear: + clear_inode(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a6bdddc33ce2..27b03776ffd2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/dcache.h> #include "f2fs.h" #include "node.h" @@ -22,14 +23,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); nid_t ino; struct inode *inode; bool nid_free = false; int err; - inode = new_inode(sb); + inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -102,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; nid_t ino = 0; int err; @@ -146,8 +145,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int err; f2fs_balance_fs(sbi); @@ -207,8 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode = dentry->d_inode; struct f2fs_dir_entry *de; struct page *page; @@ -242,8 +239,7 @@ fail: static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; size_t symlen = strlen(symname) + 1; int err; @@ -330,8 +326,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) static int f2fs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; int err = 0; @@ -369,8 +364,7 @@ out: static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; @@ -393,8 +387,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - f2fs_lock_op(sbi); - if (new_inode) { err = -ENOTEMPTY; @@ -407,6 +399,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -435,9 +429,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_lock_op(sbi); + err = f2fs_add_link(new_dentry, old_inode); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out_dir; + } if (old_dir_entry) { inc_nlink(new_dir); @@ -472,6 +470,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: + f2fs_unlock_op(sbi); kunmap(new_page); f2fs_put_page(new_page, 0); out_dir: @@ -479,7 +478,151 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } +out_old: + kunmap(old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) + goto out_old; + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) + goto out_new; + } + + if (S_ISDIR(new_inode->i_mode)) { + err = -EIO; + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) + goto out_old_dir; + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_lock_op(sbi); + + err = update_dent_inode(old_inode, &new_dentry->d_name); + if (err) + goto out_unlock; + + err = update_dent_inode(new_inode, &old_dentry->d_name); + if (err) + goto out_undo; + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + update_inode_page(old_inode); + + old_dir->i_ctime = CURRENT_TIME; + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + if (old_nlink < 0) + drop_nlink(old_dir); + else + inc_nlink(old_dir); + up_write(&F2FS_I(old_dir)->i_sem); + } + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + file_lost_pino(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + update_inode_page(new_inode); + + new_dir->i_ctime = CURRENT_TIME; + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + if (new_nlink < 0) + drop_nlink(new_dir); + else + inc_nlink(new_dir); + up_write(&F2FS_I(new_dir)->i_sem); + } + mark_inode_dirty(new_dir); + update_inode_page(new_dir); + + f2fs_unlock_op(sbi); + return 0; +out_undo: + /* Still we may fail to recover name info of f2fs_inode here */ + update_dent_inode(old_inode, &old_dentry->d_name); +out_unlock: f2fs_unlock_op(sbi); +out_new_dir: + if (new_dir_entry) { + kunmap(new_dir_page); + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_new: + kunmap(new_page); + f2fs_put_page(new_page, 0); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -487,6 +630,71 @@ out: return err; } +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct inode *inode; + int err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + f2fs_unlock_op(sbi); + clear_nlink(inode); + unlock_new_inode(inode); + make_bad_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -497,6 +705,8 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .rename2 = f2fs_rename2, + .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b697ccc9b0c..d3d90d284631 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -25,6 +25,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; bool available_free_memory(struct f2fs_sb_info *sbi, int type) { @@ -90,12 +91,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_meta_page(sbi, src_off); - - /* Dirty src_page means that it is already the new target NAT page. */ - if (PageDirty(src_page)) - return src_page; - dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -845,7 +842,7 @@ void remove_inode_page(struct inode *inode) truncate_node(&dn); } -struct page *new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -1234,12 +1231,12 @@ static int f2fs_write_node_page(struct page *page, if (wbc->for_reclaim) goto redirty_out; - mutex_lock(&sbi->node_write); + down_read(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - mutex_unlock(&sbi->node_write); + up_read(&sbi->node_write); unlock_page(page); return 0; @@ -1552,7 +1549,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } -static void recover_inline_xattr(struct inode *inode, struct page *page) +void recover_inline_xattr(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; @@ -1591,8 +1588,6 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) nid_t new_xnid = nid_of_node(page); struct node_info ni; - recover_inline_xattr(inode, page); - if (!f2fs_has_xattr_block(ofs_of_node(page))) return false; @@ -1744,7 +1739,90 @@ skip: return err; } -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +static struct nat_entry_set *grab_nat_entry_set(void) +{ + struct nat_entry_set *nes = + f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + nes->entry_cnt = 0; + INIT_LIST_HEAD(&nes->set_list); + INIT_LIST_HEAD(&nes->entry_list); + return nes; +} + +static void release_nat_entry_set(struct nat_entry_set *nes, + struct f2fs_nm_info *nm_i) +{ + f2fs_bug_on(!list_empty(&nes->entry_list)); + + nm_i->dirty_nat_cnt -= nes->entry_cnt; + list_del(&nes->set_list); + kmem_cache_free(nat_entry_set_slab, nes); +} + +static void adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head) +{ + struct nat_entry_set *next = nes; + + if (list_is_last(&nes->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (nes->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&nes->set_list, &next->set_list); +} + +static void add_nat_entry(struct nat_entry *ne, struct list_head *head) +{ + struct nat_entry_set *nes; + nid_t start_nid = START_NID(ne->ni.nid); + + list_for_each_entry(nes, head, set_list) { + if (nes->start_nid == start_nid) { + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + adjust_nat_entry_set(nes, head); + return; + } + } + + nes = grab_nat_entry_set(); + + nes->start_nid = start_nid; + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + list_add(&nes->set_list, head); +} + +static void merge_nats_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct list_head *dirty_list = &nm_i->dirty_nat_entries; + struct list_head *set_list = &nm_i->nat_entry_set; + struct nat_entry *ne, *tmp; + + write_lock(&nm_i->nat_tree_lock); + list_for_each_entry_safe(ne, tmp, dirty_list, list) { + if (nat_get_blkaddr(ne) == NEW_ADDR) + continue; + add_nat_entry(ne, set_list); + nm_i->dirty_nat_cnt++; + } + write_unlock(&nm_i->nat_tree_lock); +} + +static bool __has_cursum_space(struct f2fs_summary_block *sum, int size) +{ + if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES) + return true; + else + return false; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1752,12 +1830,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) int i; mutex_lock(&curseg->curseg_mutex); - - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { - mutex_unlock(&curseg->curseg_mutex); - return false; - } - for (i = 0; i < nats_in_cursum(sum); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; @@ -1767,23 +1839,21 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) retry: write_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne) { - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - continue; - } + if (ne) + goto found; + ne = grab_nat_entry(nm_i, nid); if (!ne) { write_unlock(&nm_i->nat_tree_lock); goto retry; } node_info_from_raw_nat(&ne->ni, &raw_ne); +found: __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); - return true; } /* @@ -1794,80 +1864,91 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry *ne, *cur; - struct page *page = NULL; - struct f2fs_nat_block *nat_blk = NULL; - nid_t start_nid = 0, end_nid = 0; - bool flushed; + struct nat_entry_set *nes, *tmp; + struct list_head *head = &nm_i->nat_entry_set; + bool to_journal = true; - flushed = flush_nats_in_journal(sbi); - - if (!flushed) - mutex_lock(&curseg->curseg_mutex); - - /* 1) flush dirty nat caches */ - list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { - nid_t nid; - struct f2fs_nat_entry raw_ne; - int offset = -1; - - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + /* merge nat entries of dirty list to nat entry set temporarily */ + merge_nats_in_set(sbi); - nid = nat_get_nid(ne); + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) { + remove_nats_in_journal(sbi); - if (flushed) - goto to_nat_page; + /* + * merge nat entries of dirty list to nat entry set temporarily + */ + merge_nats_in_set(sbi); + } - /* if there is room for nat enries in curseg->sumpage */ - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); - if (offset >= 0) { - raw_ne = nat_in_journal(sum, offset); - goto flush_now; - } -to_nat_page: - if (!page || (start_nid > nid || nid > end_nid)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; - } - start_nid = START_NID(nid); - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; + if (!nm_i->dirty_nat_cnt) + return; - /* - * get nat block with dirty flag, increased reference - * count, mapped and lock - */ + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + list_for_each_entry_safe(nes, tmp, head, set_list) { + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page; + nid_t start_nid = nes->start_nid; + + if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) + to_journal = false; + + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); + f2fs_bug_on(!nat_blk); } - f2fs_bug_on(!nat_blk); - raw_ne = nat_blk->entries[nid - start_nid]; -flush_now: - raw_nat_from_node_info(&raw_ne, &ne->ni); - - if (offset < 0) { - nat_blk->entries[nid - start_nid] = raw_ne; - } else { - nat_in_journal(sum, offset) = raw_ne; - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); - if (nat_get_blkaddr(ne) == NULL_ADDR && + if (nat_get_blkaddr(ne) == NULL_ADDR && add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); + write_lock(&nm_i->nat_tree_lock); + __del_from_nat_cache(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } else { + write_lock(&nm_i->nat_tree_lock); + __clear_nat_cache_dirty(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } } + + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); + + release_nat_entry_set(nes, nm_i); } - if (!flushed) - mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(page, 1); + + f2fs_bug_on(!list_empty(head)); + f2fs_bug_on(nm_i->dirty_nat_cnt); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1896,6 +1977,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + INIT_LIST_HEAD(&nm_i->nat_entry_set); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); @@ -1976,19 +2058,30 @@ int __init create_node_manager_caches(void) nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) - return -ENOMEM; + goto fail; free_nid_slab = f2fs_kmem_cache_create("free_nid", sizeof(struct free_nid)); - if (!free_nid_slab) { - kmem_cache_destroy(nat_entry_slab); - return -ENOMEM; - } + if (!free_nid_slab) + goto destory_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destory_free_nid; return 0; + +destory_free_nid: + kmem_cache_destroy(free_nid_slab); +destory_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; } void destroy_node_manager_caches(void) { + kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7281112cd1c8..8a116a407599 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -89,6 +89,13 @@ enum mem_type { DIRTY_DENTS /* indicates dirty dentry pages */ }; +struct nat_entry_set { + struct list_head set_list; /* link with all nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t start_nid; /* start nid of nats in set */ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + /* * For free nid mangement */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a112368a4a86..fe1c6d921ba2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -300,6 +300,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + recover_inline_xattr(inode, page); + if (recover_inline_data(inode, page)) goto out; @@ -434,7 +436,9 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + block_t blkaddr; int err; bool need_writecp = false; @@ -447,6 +451,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ sbi->por_doing = true; + + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -462,8 +469,21 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + sbi->por_doing = false; - if (!err && need_writecp) + if (err) { + discard_next_dnode(sbi, blkaddr); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) + sync_meta_pages(sbi, META, LONG_MAX); + } else if (need_writecp) { write_checkpoint(sbi, false); + } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d04613df710a..0dfeebae2a50 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -239,6 +239,12 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; struct flush_cmd cmd; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + + if (test_opt(sbi, NOBARRIER)) + return 0; + if (!test_opt(sbi, FLUSH_MERGE)) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); @@ -272,13 +278,13 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); - sbi->sm_info->cmd_control_info = fcc; + SM_I(sbi)->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; return err; } @@ -287,13 +293,12 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = - sbi->sm_info->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; if (fcc && fcc->f2fs_issue_flush) kthread_stop(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -377,11 +382,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi) +void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - if (f2fs_issue_discard(sbi, blkaddr, 1)) { struct page *page = grab_meta_page(sbi, blkaddr); /* zero-filled page */ @@ -437,17 +439,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; + unsigned int segno; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); - while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) - break; + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) __set_test_and_free(sbi, segno); - } mutex_unlock(&dirty_i->seglist_lock); } @@ -974,14 +971,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; - unsigned int old_cursegno; curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - old_cursegno = curseg->segno; /* * __add_sum_entry should be resided under the curseg_mutex @@ -1002,7 +997,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); mutex_unlock(&sit_i->sentry_lock); @@ -1532,7 +1526,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start = 0, end = 0; - unsigned int segno = -1; + unsigned int segno; bool flushed; mutex_lock(&curseg->curseg_mutex); @@ -1544,7 +1538,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) */ flushed = flush_sits_in_journal(sbi); - while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { + for_each_set_bit(segno, bitmap, nsegs) { struct seg_entry *se = get_seg_entry(sbi, segno); int sit_offset, offset; @@ -1703,7 +1697,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); if (!array) return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7091204680f4..55973f7b0330 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -347,8 +347,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), - start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; @@ -486,6 +486,10 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode)) return false; + /* this is only set during fdatasync */ + if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + return true; + switch (SM_I(sbi)->ipu_policy) { case F2FS_IPU_FORCE: return true; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f96d9372ade..657582fc7601 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -52,6 +52,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_flush_merge, + Opt_nobarrier, Opt_err, }; @@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_flush_merge, "flush_merge"}, + {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL}, }; @@ -339,6 +341,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -544,6 +549,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_data"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -615,7 +622,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; /* @@ -642,8 +649,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { destroy_flush_cmd_control(sbi); - } else if (test_opt(sbi, FLUSH_MERGE) && - !sbi->sm_info->cmd_control_info) { + } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -947,7 +953,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->node_write); + init_rwsem(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); @@ -997,7 +1003,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - init_orphan_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); @@ -1034,8 +1040,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); err = -EINVAL; - goto free_root_inode; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -1082,7 +1089,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index be568b7311d6..ef9bef118342 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); + __wait_on_bit(wqh, &wq, bit_wait, + TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index aec01be91b0a..89acec742e0b 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, _enter("%p", cookie); wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; @@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; @@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) _enter("%p", cookie); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, - fscache_wait_bit_interruptible, TASK_UNINTERRUPTIBLE); _leave(""); @@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) } wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index bc6c08fcfddd..7872a62ef30c 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void) return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); } -extern int fscache_wait_bit(void *); -extern int fscache_wait_bit_interruptible(void *); extern int fscache_wait_atomic_t(atomic_t *); /* diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 63f868e869b9..a31b83c5cbd9 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -197,24 +197,6 @@ static void __exit fscache_exit(void) module_exit(fscache_exit); /* - * wait_on_bit() sleep function for uninterruptible waiting - */ -int fscache_wait_bit(void *flags) -{ - schedule(); - return 0; -} - -/* - * wait_on_bit() sleep function for interruptible waiting - */ -int fscache_wait_bit_interruptible(void *flags) -{ - schedule(); - return signal_pending(current); -} - -/* * wait_on_atomic_t() sleep function for uninterruptible waiting */ int fscache_wait_atomic_t(atomic_t *p) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index ed70714503fa..85332b9d19d1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -298,7 +298,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); _leave(" = -ERESTARTSYS"); @@ -342,7 +341,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, if (stat_op_waits) fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit_interruptible, TASK_INTERRUPTIBLE) != 0) { ret = fscache_cancel_op(op, do_cancel); if (ret == 0) @@ -351,7 +349,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, /* it's been removed from the pending queue by another party, * so we should get to run shortly */ wait_on_bit(&op->flags, FSCACHE_OP_WAITING, - fscache_wait_bit, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); } _debug("<<< GO"); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ee4e04fe60fc..7f513b1ceb2c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) } /** - * gfs2_glock_holder_wait - * @word: unused - * - * This function and gfs2_glock_demote_wait both show up in the WCHAN - * field. Thus I've separated these otherwise identical functions in - * order to be more informative to the user. - */ - -static int gfs2_glock_holder_wait(void *word) -{ - schedule(); - return 0; -} - -static int gfs2_glock_demote_wait(void *word) -{ - schedule(); - return 0; -} - -/** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * @@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh) unsigned long time1 = jiffies; might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ /* Lengthen the minimum hold time. */ gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + @@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 4fafea1c9ecf..641383a9c1bb 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -936,12 +936,6 @@ fail: return error; } -static int dlm_recovery_wait(void *word) -{ - schedule(); - return 0; -} - static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -976,7 +970,7 @@ restart: fs_info(sdp, "control_first_done wait gen %u\n", start_gen); wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, - dlm_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index bc564c0d6d16..d3eae244076e 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } -static int gfs2_journalid_wait(void *word) -{ - if (signal_pending(current)) - return -EINTR; - schedule(); - return 0; -} - static int wait_on_journal(struct gfs2_sbd *sdp) { if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) return 0; - return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; } void gfs2_online_uevent(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 94555d4c5698..573bd3b758fa 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -591,12 +591,6 @@ done: wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -static int gfs2_recovery_wait(void *word) -{ - schedule(); - return 0; -} - int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; @@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) BUG_ON(!rv); if (wait) - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? jd->jd_recover_error : 0; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1319b5c4ec68..2607ff13d486 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -894,7 +888,7 @@ restart: continue; spin_unlock(&sdp->sd_jindex_spin); wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + TASK_UNINTERRUPTIBLE); goto restart; } spin_unlock(&sdp->sd_jindex_spin); diff --git a/fs/inode.c b/fs/inode.c index 6eecb7ff0b9a..5938f3928944 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1695,13 +1695,6 @@ int inode_needs_sync(struct inode *inode) } EXPORT_SYMBOL(inode_needs_sync); -int inode_wait(void *word) -{ - schedule(); - return 0; -} -EXPORT_SYMBOL(inode_wait); - /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6f0f590cc5a3..5f09370c90a8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } -static int sleep_on_shadow_bh(void *word) -{ - io_schedule(); - return 0; -} - /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -906,8 +900,8 @@ repeat: if (buffer_shadow(bh)) { JBUFFER_TRACE(jh, "on shadow: sleep"); jbd_unlock_bh_state(bh); - wait_on_bit(&bh->b_state, BH_Shadow, - sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); + wait_on_bit_io(&bh->b_state, BH_Shadow, + TASK_UNINTERRUPTIBLE); goto repeat; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index d895b4b7b661..4429d6d9217f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -896,7 +896,7 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @static_name: don't copy file name + * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1812f026960c..daa8e7514eae 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -306,11 +306,9 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) static void nsm_init_private(struct nsm_handle *nsm) { u64 *p = (u64 *)&nsm->sm_priv.data; - struct timespec ts; s64 ns; - ktime_get_ts(&ts); - ns = timespec_to_ns(&ts); + ns = ktime_get_ns(); put_unaligned(ns, p); put_unaligned((unsigned long)nsm, p + 1); } diff --git a/fs/locks.c b/fs/locks.c index 717fbc404e6b..a6f54802d277 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -325,7 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return -ENOMEM; fl->fl_file = filp; - fl->fl_owner = (fl_owner_t)filp; + fl->fl_owner = filp; fl->fl_pid = current->tgid; fl->fl_flags = FL_FLOCK; fl->fl_type = type; @@ -431,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = (fl_owner_t)current->files; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(posix_lock_file_wait); int locks_mandatory_locked(struct file *file) { struct inode *inode = file_inode(file); - fl_owner_t owner = current->files; struct file_lock *fl; /* @@ -1165,7 +1164,8 @@ int locks_mandatory_locked(struct file *file) for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { if (!IS_POSIX(fl)) continue; - if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file) + if (fl->fl_owner != current->files && + fl->fl_owner != file) break; } spin_unlock(&inode->i_lock); @@ -1205,7 +1205,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, for (;;) { if (filp) { - fl.fl_owner = (fl_owner_t)filp; + fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; error = __posix_lock_file(inode, &fl, NULL); if (!error) @@ -1948,7 +1948,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) cmd = F_GETLK; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2103,7 +2103,7 @@ again: cmd = F_SETLK; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2112,7 +2112,7 @@ again: cmd = F_SETLKW; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW: file_lock->fl_flags |= FL_SLEEP; @@ -2170,7 +2170,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) cmd = F_GETLK64; file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = (fl_owner_t)filp; + file_lock.fl_owner = filp; } error = vfs_test_lock(filp, &file_lock); @@ -2242,7 +2242,7 @@ again: cmd = F_SETLK64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; break; case F_OFD_SETLKW: error = -EINVAL; @@ -2251,7 +2251,7 @@ again: cmd = F_SETLKW64; file_lock->fl_flags |= FL_OFDLCK; - file_lock->fl_owner = (fl_owner_t)filp; + file_lock->fl_owner = filp; /* Fallthrough */ case F_SETLKW64: file_lock->fl_flags |= FL_SLEEP; @@ -2324,11 +2324,11 @@ void locks_remove_file(struct file *filp) if (!inode->i_flock) return; - locks_remove_posix(filp, (fl_owner_t)filp); + locks_remove_posix(filp, filp); if (filp->f_op->flock) { struct file_lock fl = { - .fl_owner = (fl_owner_t)filp, + .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, .fl_flags = FL_FLOCK, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4042ff58fe3f..524dd80d1898 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -361,8 +361,8 @@ start: * Prevent starvation issues if someone is doing a consistency * sync-to-disk */ - ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 44bf0140a4c7..e2a0361e24c6 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -783,8 +783,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - nfs_wait_bit_killable, TASK_KILLABLE); + wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 567983d2c0eb..7dd55b745c4d 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -174,7 +174,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) static struct key_type key_type_id_resolver = { .name = "id_resolver", - .instantiate = user_instantiate, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, .match = user_match, .revoke = user_revoke, .destroy = user_destroy, @@ -282,6 +284,8 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, desc, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } + if (!IS_ERR(rkey)) + set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); kfree(desc); return rkey; @@ -394,7 +398,9 @@ static const struct rpc_pipe_ops idmap_upcall_ops = { static struct key_type key_type_id_resolver_legacy = { .name = "id_legacy", - .instantiate = user_instantiate, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, .match = user_match, .revoke = user_revoke, .destroy = user_destroy, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9927913c97c2..abd37a380535 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(void *word) +int nfs_wait_bit_killable(struct wait_bit_key *key) { if (fatal_signal_pending(current)) return -ERESTARTSYS; @@ -1074,8 +1074,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) * the bit lock here if it looks like we're going to be doing that. */ for (;;) { - ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, - nfs_wait_bit_killable, TASK_KILLABLE); + ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; spin_lock(&inode->i_lock); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f415cbf9f6c3..617f36611d4a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -348,7 +348,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(void *word); +extern int nfs_wait_bit_killable(struct wait_bit_key *key); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 848f6853c59e..42f121182167 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1251,8 +1251,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); atomic_inc(&clp->cl_count); - res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, - nfs_wait_bit_killable, TASK_KILLABLE); + res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, + nfs_wait_bit_killable, TASK_KILLABLE); if (res) goto out; if (clp->cl_cons_state < 0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 17fab89f6358..0be5050638f7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -115,7 +115,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&c->flags); + ret = nfs_wait_bit_killable(&q.key); } while (atomic_read(&c->io_count) != 0); finish_wait(wq, &q.wait); return ret; @@ -136,12 +136,6 @@ nfs_iocounter_wait(struct nfs_io_counter *c) return __nfs_iocounter_wait(c); } -static int nfs_wait_bit_uninterruptible(void *word) -{ - io_schedule(); - return 0; -} - /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -156,7 +150,6 @@ nfs_page_group_lock(struct nfs_page *req) WARN_ON_ONCE(head != head->wb_head); wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - nfs_wait_bit_uninterruptible, TASK_UNINTERRUPTIBLE); } @@ -435,9 +428,8 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - return wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_uninterruptible, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_io(&req->wb_flags, PG_BUSY, + TASK_UNINTERRUPTIBLE); } /* diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6fdcd233d6f7..a8914b335617 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1885,7 +1885,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { if (!sync) goto out; - status = wait_on_bit_lock(&nfsi->flags, + status = wait_on_bit_lock_action(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5e2f10304548..962c9ee758be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -623,7 +623,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int err; /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING, + err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, nfs_wait_bit_killable, TASK_KILLABLE); if (err) goto out_err; @@ -1703,7 +1703,7 @@ int nfs_commit_inode(struct inode *inode, int how) return error; if (!may_wait) goto out_mark_dirty; - error = wait_on_bit(&NFS_I(inode)->flags, + error = wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_COMMIT, nfs_wait_bit_killable, TASK_KILLABLE); diff --git a/fs/open.c b/fs/open.c index 36662d036237..d6fd3acde134 100644 --- a/fs/open.c +++ b/fs/open.c @@ -263,11 +263,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EPERM; /* - * We can not allow to do any fallocate operation on an active - * swapfile + * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) - ret = -ETXTBSY; + return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has diff --git a/fs/proc/array.c b/fs/proc/array.c index 64db2bceac59..cd3653e4f35c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -297,15 +297,11 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_puts(m, header); CAP_FOR_EACH_U32(__capi) { seq_printf(m, "%08x", - a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); + a->cap[CAP_LAST_U32 - __capi]); } seq_putc(m, '\n'); } -/* Remove non-existent capabilities */ -#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ - CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) - static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -319,11 +315,6 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); - NORM_CAPS(cap_inheritable); - NORM_CAPS(cap_permitted); - NORM_CAPS(cap_effective); - NORM_CAPS(cap_bset); - render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); @@ -473,13 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, priority = task_prio(task); nice = task_nice(task); - /* Temporary variable needed for gcc-2.96 */ - /* convert timespec -> nsec*/ - start_time = - (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC - + task->real_start_time.tv_nsec; /* convert nsec -> ticks */ - start_time = nsec_to_clock_t(start_time); + start_time = nsec_to_clock_t(task->real_start_time); seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); seq_put_decimal_ll(m, ' ', ppid); diff --git a/fs/timerfd.c b/fs/timerfd.c index 0013142c0475..80c350216ea8 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -35,8 +35,9 @@ struct timerfd_ctx { ktime_t moffs; wait_queue_head_t wqh; u64 ticks; - int expired; int clockid; + short unsigned expired; + short unsigned settime_flags; /* to show in fdinfo */ struct rcu_head rcu; struct list_head clist; bool might_cancel; @@ -92,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, */ void timerfd_clock_was_set(void) { - ktime_t moffs = ktime_get_monotonic_offset(); + ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); struct timerfd_ctx *ctx; unsigned long flags; @@ -125,7 +126,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx) { if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) return false; - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); return true; } @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, if (timerfd_canceled(ctx)) return -ECANCELED; } + + ctx->settime_flags = flags & TFD_SETTIME_FLAGS; return 0; } @@ -284,11 +287,77 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show(struct seq_file *m, struct file *file) +{ + struct timerfd_ctx *ctx = file->private_data; + struct itimerspec t; + + spin_lock_irq(&ctx->wqh.lock); + t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + t.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + + return seq_printf(m, + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0%o\n" + "it_value: (%llu, %llu)\n" + "it_interval: (%llu, %llu)\n", + ctx->clockid, (unsigned long long)ctx->ticks, + ctx->settime_flags, + (unsigned long long)t.it_value.tv_sec, + (unsigned long long)t.it_value.tv_nsec, + (unsigned long long)t.it_interval.tv_sec, + (unsigned long long)t.it_interval.tv_nsec); +} +#else +#define timerfd_show NULL +#endif + +#ifdef CONFIG_CHECKPOINT_RESTORE +static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct timerfd_ctx *ctx = file->private_data; + int ret = 0; + + switch (cmd) { + case TFD_IOC_SET_TICKS: { + u64 ticks; + + if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks))) + return -EFAULT; + if (!ticks) + return -EINVAL; + + spin_lock_irq(&ctx->wqh.lock); + if (!timerfd_canceled(ctx)) { + ctx->ticks = ticks; + if (ticks) + wake_up_locked(&ctx->wqh); + } else + ret = -ECANCELED; + spin_unlock_irq(&ctx->wqh.lock); + break; + } + default: + ret = -ENOTTY; + break; + } + + return ret; +} +#else +#define timerfd_ioctl NULL +#endif + static const struct file_operations timerfd_fops = { .release = timerfd_release, .poll = timerfd_poll, .read = timerfd_read, .llseek = noop_llseek, + .show_fdinfo = timerfd_show, + .unlocked_ioctl = timerfd_ioctl, }; static int timerfd_fget(int fd, struct fd *p) @@ -336,7 +405,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - ctx->moffs = ktime_get_monotonic_offset(); + ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); |