summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/balloc.c24
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h70
-rw-r--r--fs/ext4/ext4_extents.h6
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c296
-rw-r--r--fs/ext4/extents_status.c125
-rw-r--r--fs/ext4/extents_status.h51
-rw-r--r--fs/ext4/file.c23
-rw-r--r--fs/ext4/ialloc.c90
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c364
-rw-r--r--fs/ext4/ioctl.c10
-rw-r--r--fs/ext4/mballoc.c49
-rw-r--r--fs/ext4/migrate.c4
-rw-r--r--fs/ext4/move_extent.c2
-rw-r--r--fs/ext4/namei.c35
-rw-r--r--fs/ext4/page-io.c30
-rw-r--r--fs/ext4/super.c82
20 files changed, 830 insertions, 444 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ddd715e42a5c..dc5d572ebd6a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -184,6 +184,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
+ struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
@@ -191,11 +192,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_group_clusters_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -305,7 +304,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
*/
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_grpblk_t offset;
@@ -352,10 +351,11 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return;
@@ -366,12 +366,14 @@ void ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
set_buffer_verified(bh);
@@ -445,7 +447,10 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
- return bh;
+ if (buffer_verified(bh))
+ return bh;
+ put_bh(bh);
+ return NULL;
}
/* Returns 0 on success, 1 on error */
@@ -469,7 +474,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
ext4_validate_block_bitmap(sb, desc, block_group, bh);
- return 0;
+ /* ...but check for error just in case errors=continue. */
+ return !buffer_verified(bh);
}
struct buffer_head *
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3c7d288ae94c..680bb3388919 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,7 +33,7 @@ static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
* Check if the given dir-inode refers to an htree-indexed directory
- * (or a directory which chould potentially get coverted to use htree
+ * (or a directory which could potentially get converted to use htree
* indexing).
*
* Return 1 if it is a dx dir, 0 if not
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b577e45425b0..af815ea9d7cc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -180,7 +180,6 @@ struct ext4_map_blocks {
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
-#define EXT4_IO_END_DIRECT 0x0002
/*
* For converting uninitialized extents on a work queue. 'handle' is used for
@@ -196,8 +195,6 @@ typedef struct ext4_io_end {
unsigned int flag; /* unwritten or not */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct kiocb *iocb; /* iocb struct for AIO */
- int result; /* error value for AIO */
atomic_t count; /* reference counter */
} ext4_io_end_t;
@@ -561,6 +558,18 @@ enum {
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
+ * read_extent_tree_block(), ext4_split_extent_at(),
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
+ */
+#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
+
+/*
* Flags used by ext4_free_blocks
*/
#define EXT4_FREE_BLOCKS_METADATA 0x0001
@@ -569,6 +578,7 @@ enum {
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RESERVE 0x0040
/*
* ioctl commands
@@ -590,6 +600,7 @@ enum {
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -900,11 +911,9 @@ struct ext4_inode_info {
* Completed IOs that need unwritten extents handling and don't have
* transaction reserved
*/
- struct list_head i_unrsv_conversion_list;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
- struct work_struct i_unrsv_conversion_work;
spinlock_t i_block_reservation_lock;
@@ -1276,8 +1285,6 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
- /* workqueue for unreserved extent convertions (dio) */
- struct workqueue_struct *unrsv_conversion_wq;
/* workqueue for reserved extent conversions (buffered io) */
struct workqueue_struct *rsv_conversion_wq;
@@ -1340,9 +1347,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- /* Writeback has to have coversion transaction reserved */
- WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
- !(io_end->flag & EXT4_IO_END_DIRECT));
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
@@ -1375,6 +1379,7 @@ enum {
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1915,7 +1920,7 @@ extern ext4_group_t ext4_get_group_number(struct super_block *sb,
extern void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
- unsigned int block_group,
+ ext4_group_t block_group,
struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
@@ -2086,6 +2091,7 @@ extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
@@ -2416,16 +2422,32 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
- /*
- * XXX: replace with spinlock if seen contended -bzzz
- */
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
up_write(&EXT4_I(inode)->i_data_sem);
- return ;
+}
+
+/*
+ * Update i_disksize after writeback has been started. Races with truncate
+ * are avoided by checking i_size under i_data_sem.
+ */
+static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (newsize > i_size)
+ newsize = i_size;
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
}
struct ext4_group_info {
@@ -2448,9 +2470,15 @@ struct ext4_group_info {
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \
+ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
@@ -2654,6 +2682,12 @@ extern int ext4_check_blockref(const char *, unsigned int,
struct ext4_ext_path;
struct ext4_extent;
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
@@ -2683,7 +2717,8 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path *,
struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
+ struct ext4_ext_path *,
+ int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern int ext4_find_delalloc_range(struct inode *inode,
@@ -2692,7 +2727,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2715,7 +2750,6 @@ extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
-extern void ext4_end_io_unrsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 51bc821ade90..5074fe23f19e 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -134,12 +134,6 @@ struct ext4_ext_path {
*/
/*
- * Maximum number of logical blocks in a file; ext4_extent's ee_block is
- * __le32.
- */
-#define EXT_MAX_BLOCKS 0xffffffff
-
-/*
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 72a3600aedbd..17ac112ab101 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -255,10 +255,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
- if (err) {
- /* Errors can only happen if there is a bug */
- handle->h_err = err;
- __ext4_journal_stop(where, line, handle);
+ /* Errors can only happen if there is a bug */
+ if (WARN_ON_ONCE(err)) {
+ ext4_journal_abort_handle(where, line, __func__, bh,
+ handle, err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 2877258d9497..81cfefa9dc0c 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -197,7 +197,7 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
- * Return true if object was sucessfully removed
+ * Return true if object was successfully removed
*/
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72ba4705d4fa..54d52afcdb19 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -407,7 +407,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth)
+ int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0;
@@ -447,42 +447,149 @@ static int __ext4_ext_check(const char *function, unsigned int line,
corrupted:
ext4_error_inode(inode, function, line, 0,
- "bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- error_msg, le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
-
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return -EIO;
}
-#define ext4_ext_check(inode, eh, depth) \
- __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth, pblk) \
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
int ext4_ext_check_inode(struct inode *inode)
{
- return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
-static int __ext4_ext_check_block(const char *function, unsigned int line,
- struct inode *inode,
- struct ext4_extent_header *eh,
- int depth,
- struct buffer_head *bh)
+static struct buffer_head *
+__read_extent_tree_block(const char *function, unsigned int line,
+ struct inode *inode, ext4_fsblk_t pblk, int depth,
+ int flags)
{
- int ret;
+ struct buffer_head *bh;
+ int err;
- if (buffer_verified(bh))
- return 0;
- ret = ext4_ext_check(inode, eh, depth);
- if (ret)
- return ret;
+ bh = sb_getblk(inode->i_sb, pblk);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
+
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
+ err = bh_submit_read(bh);
+ if (err < 0)
+ goto errout;
+ }
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
+ return bh;
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
set_buffer_verified(bh);
- return ret;
+ /*
+ * If this is a leaf block, cache all of its entries
+ */
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
+ struct ext4_extent_header *eh = ext_block_hdr(bh);
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev,
+ lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_uninitialized(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+ }
+ return bh;
+errout:
+ put_bh(bh);
+ return ERR_PTR(err);
+
}
-#define ext4_ext_check_block(inode, eh, depth, bh) \
- __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+#define read_extent_tree_block(inode, pblk, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+ (depth), (flags))
+
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ up_read(&ei->i_data_sem);
+ return -ENOMEM;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -716,7 +823,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_path *path)
+ struct ext4_ext_path *path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
@@ -748,20 +855,13 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh)) {
- ret = -ENOMEM;
+ bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
+ flags);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
goto err;
}
- if (!bh_uptodate_or_lock(bh)) {
- trace_ext4_ext_load_extent(inode, block,
- path[ppos].p_block);
- ret = bh_submit_read(bh);
- if (ret < 0) {
- put_bh(bh);
- goto err;
- }
- }
+
eh = ext_block_hdr(bh);
ppos++;
if (unlikely(ppos > depth)) {
@@ -773,11 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
- i--;
-
- ret = ext4_ext_check_block(inode, eh, i, bh);
- if (ret < 0)
- goto err;
}
path[ppos].p_depth = i;
@@ -1198,7 +1293,8 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- unsigned int flags,
+ unsigned int mb_flags,
+ unsigned int gb_flags,
struct ext4_ext_path *path,
struct ext4_extent *newext)
{
@@ -1220,7 +1316,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, flags, path, newext, i);
+ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
goto out;
@@ -1228,12 +1324,12 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, flags, newext);
+ err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext);
if (err)
goto out;
@@ -1241,7 +1337,7 @@ repeat:
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
- path);
+ path, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -1412,29 +1508,21 @@ got_index:
ix++;
block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
- eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_block(inode, eh,
- path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
+ bh = read_extent_tree_block(inode, block,
+ path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+ eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = sb_bread(inode->i_sb, block);
- if (bh == NULL)
- return -EIO;
+ bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = ext_block_hdr(bh);
- if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
- put_bh(bh);
- return -EIO;
- }
ex = EXT_FIRST_EXTENT(eh);
found_extent:
*logical = le32_to_cpu(ex->ee_block);
@@ -1705,7 +1793,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
brelse(path[1].p_bh);
ext4_free_blocks(handle, inode, NULL, blk, 1,
- EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET |
+ EXT4_FREE_BLOCKS_RESERVE);
}
/*
@@ -1793,7 +1882,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext, int flag)
+ struct ext4_extent *newext, int gb_flags)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1802,7 +1891,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
- int flags = 0;
+ int mb_flags = 0;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1817,7 +1906,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
}
/* try to insert block into found extent and return */
- if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
/*
* Try to see whether we should rather test the extent on
@@ -1920,7 +2009,7 @@ prepend:
if (next != EXT_MAX_BLOCKS) {
ext_debug("next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_ext_find_extent(inode, next, NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL, 0);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
@@ -1939,9 +2028,10 @@ prepend:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
- flags = EXT4_MB_USE_RESERVED;
- err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
+ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ mb_flags = EXT4_MB_USE_RESERVED;
+ err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
+ path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2007,7 +2097,7 @@ has_space:
merge:
/* try to merge extents */
- if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
+ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
ext4_ext_try_to_merge(handle, inode, path, nearex);
@@ -2050,7 +2140,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
path = NULL;
}
- path = ext4_ext_find_extent(inode, block, path);
+ path = ext4_ext_find_extent(inode, block, path, 0);
if (IS_ERR(path)) {
up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
@@ -2195,8 +2285,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_lblk_t block)
{
int depth = ext_depth(inode);
- unsigned long len;
- ext4_lblk_t lblock;
+ unsigned long len = 0;
+ ext4_lblk_t lblock = 0;
struct ext4_extent *ex;
ex = path[depth].p_ext;
@@ -2233,7 +2323,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
ext4_es_insert_extent(inode, lblock, len, ~0,
EXTENT_STATUS_HOLE);
} else {
- lblock = len = 0;
BUG();
}
@@ -2712,7 +2801,7 @@ again:
ext4_lblk_t ee_block;
/* find extent for this block */
- path = ext4_ext_find_extent(inode, end, NULL);
+ path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2754,6 +2843,7 @@ again:
*/
err = ext4_split_extent_at(handle, inode, path,
end + 1, split_flag,
+ EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -2782,7 +2872,7 @@ again:
path[0].p_hdr = ext_inode_hdr(inode);
i = 0;
- if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
err = -EIO;
goto out;
}
@@ -2829,10 +2919,12 @@ again:
ext_debug("move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
- if (!bh) {
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx), depth - i - 1,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(bh)) {
/* should we reset i_size? */
- err = -EIO;
+ err = PTR_ERR(bh);
break;
}
/* Yield here to deal with large extent trees.
@@ -2842,11 +2934,6 @@ again:
err = -EIO;
break;
}
- if (ext4_ext_check_block(inode, ext_block_hdr(bh),
- depth - i - 1, bh)) {
- err = -EIO;
- break;
- }
path[i + 1].p_bh = bh;
/* save actual number of indexes since this
@@ -2961,6 +3048,23 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
+static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+{
+ ext4_lblk_t ee_block;
+ ext4_fsblk_t ee_pblock;
+ unsigned int ee_len;
+
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_pblock = ext4_ext_pblock(ex);
+
+ if (ee_len == 0)
+ return 0;
+
+ return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
+}
+
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
@@ -3113,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle,
goto fix_extent_len;
/* update extent status tree */
- err = ext4_es_zeroout(inode, &zero_ex);
+ err = ext4_zeroout_es(inode, &zero_ex);
goto out;
} else if (err)
@@ -3133,7 +3237,7 @@ fix_extent_len:
* ext4_split_extents() splits an extent and mark extent which is covered
* by @map as split_flags indicates
*
- * It may result in splitting the extent into multiple extents (upto three)
+ * It may result in splitting the extent into multiple extents (up to three)
* There are three possibilities:
* a> There is no split required
* b> Splits in two extents: Split is happening at either end of the extent
@@ -3181,7 +3285,7 @@ static int ext4_split_extent(handle_t *handle,
* result in split of original leaf or extent zeroout.
*/
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3464,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err)
- err = ext4_es_zeroout(inode, &zero_ex);
+ err = ext4_zeroout_es(inode, &zero_ex);
return err ? err : allocated;
}
@@ -3565,7 +3669,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
if (err < 0)
goto out;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4052,7 +4156,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -4744,6 +4848,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error;
}
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ }
+
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
@@ -4771,6 +4881,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
error = ext4_fill_fiemap_extents(inode, start_blk,
len_blks, fieinfo);
}
-
+ ext4_es_lru_add(inode);
return error;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 91cb110da1b4..2d1bdbe78c04 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -13,7 +13,6 @@
#include <linux/list_sort.h>
#include "ext4.h"
#include "extents_status.h"
-#include "ext4_extents.h"
#include <trace/events/ext4.h>
@@ -263,7 +262,7 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
if (tree->cache_es) {
es1 = tree->cache_es;
if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %llx\n",
+ es_debug("%u cached by [%u/%u) %llu %x\n",
lblk, es1->es_lblk, es1->es_len,
ext4_es_pblock(es1), ext4_es_status(es1));
goto out;
@@ -409,6 +408,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
}
#ifdef ES_AGGRESSIVE_TEST
+#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */
+
static void ext4_es_insert_extent_ext_check(struct inode *inode,
struct extent_status *es)
{
@@ -419,7 +420,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
unsigned short ee_len;
int depth, ee_status, es_status;
- path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+ path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
if (IS_ERR(path))
return;
@@ -641,13 +642,13 @@ out:
*/
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status)
+ unsigned int status)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
- es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+ es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
if (!len)
@@ -684,6 +685,38 @@ error:
}
/*
+ * ext4_es_cache_extent() inserts information into the extent status
+ * tree if and only if there isn't information about the range in
+ * question already.
+ */
+void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
+{
+ struct extent_status *es;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ ext4_es_store_pblock(&newes, pblk);
+ ext4_es_store_status(&newes, status);
+ trace_ext4_es_cache_extent(inode, &newes);
+
+ if (!len)
+ return;
+
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
* ext4_es_lookup_extent() looks up an extent in extent status tree.
*
* ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
@@ -871,23 +904,6 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
return err;
}
-int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
-{
- ext4_lblk_t ee_block;
- ext4_fsblk_t ee_pblock;
- unsigned int ee_len;
-
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
- ee_pblock = ext4_ext_pblock(ex);
-
- if (ee_len == 0)
- return 0;
-
- return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
-}
-
static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
struct list_head *b)
{
@@ -895,6 +911,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
eia = list_entry(a, struct ext4_inode_info, i_es_lru);
eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
if (eia->i_touch_when == eib->i_touch_when)
return 0;
if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -908,21 +930,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
{
struct ext4_inode_info *ei;
struct list_head *cur, *tmp;
- LIST_HEAD(skiped);
+ LIST_HEAD(skipped);
int ret, nr_shrunk = 0;
+ int retried = 0, skip_precached = 1, nr_skipped = 0;
spin_lock(&sbi->s_es_lru_lock);
- /*
- * If the inode that is at the head of LRU list is newer than
- * last_sorted time, that means that we need to sort this list.
- */
- ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
- if (sbi->s_es_last_sorted < ei->i_touch_when) {
- list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
- sbi->s_es_last_sorted = jiffies;
- }
-
+retry:
list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
/*
* If we have already reclaimed all extents from extent
@@ -933,9 +947,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- /* Skip the inode that is newer than the last_sorted time */
- if (sbi->s_es_last_sorted < ei->i_touch_when) {
- list_move_tail(cur, &skiped);
+ /*
+ * Skip the inode that is newer than the last_sorted
+ * time. Normally we try hard to avoid shrinking
+ * precached inodes, but we will as a last resort.
+ */
+ if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
+ (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))) {
+ nr_skipped++;
+ list_move_tail(cur, &skipped);
continue;
}
@@ -955,11 +976,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
}
/* Move the newer inodes into the tail of the LRU list. */
- list_splice_tail(&skiped, &sbi->s_es_lru);
+ list_splice_tail(&skipped, &sbi->s_es_lru);
+ INIT_LIST_HEAD(&skipped);
+
+ /*
+ * If we skipped any inodes, and we weren't able to make any
+ * forward progress, sort the list and try again.
+ */
+ if ((nr_shrunk == 0) && nr_skipped && !retried) {
+ retried++;
+ list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+ sbi->s_es_last_sorted = jiffies;
+ ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
+ i_es_lru);
+ /*
+ * If there are no non-precached inodes left on the
+ * list, start releasing precached extents.
+ */
+ if (ext4_test_inode_state(&ei->vfs_inode,
+ EXT4_STATE_EXT_PRECACHED))
+ skip_precached = 0;
+ goto retry;
+ }
+
spin_unlock(&sbi->s_es_lru_lock);
if (locked_ei && nr_shrunk == 0)
- nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
return nr_shrunk;
}
@@ -1034,10 +1077,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
struct rb_node *node;
struct extent_status *es;
int nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (ei->i_es_lru_nr == 0)
return 0;
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
node = rb_first(&tree->root);
while (node != NULL) {
es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index e936730cc5b0..167f4ab8ecc3 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -29,16 +29,26 @@
/*
* These flags live in the high bits of extent_status.es_pblk
*/
-#define EXTENT_STATUS_WRITTEN (1ULL << 63)
-#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
-#define EXTENT_STATUS_DELAYED (1ULL << 61)
-#define EXTENT_STATUS_HOLE (1ULL << 60)
+#define ES_SHIFT 60
+
+#define EXTENT_STATUS_WRITTEN (1 << 3)
+#define EXTENT_STATUS_UNWRITTEN (1 << 2)
+#define EXTENT_STATUS_DELAYED (1 << 1)
+#define EXTENT_STATUS_HOLE (1 << 0)
#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE)
+#define ES_WRITTEN (1ULL << 63)
+#define ES_UNWRITTEN (1ULL << 62)
+#define ES_DELAYED (1ULL << 61)
+#define ES_HOLE (1ULL << 60)
+
+#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \
+ ES_DELAYED | ES_HOLE)
+
struct ext4_sb_info;
struct ext4_extent;
@@ -60,7 +70,10 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned long long status);
+ unsigned int status);
+extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
@@ -68,36 +81,35 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
-extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
static inline int ext4_es_is_written(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+ return (es->es_pblk & ES_WRITTEN) != 0;
}
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+ return (es->es_pblk & ES_UNWRITTEN) != 0;
}
static inline int ext4_es_is_delayed(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+ return (es->es_pblk & ES_DELAYED) != 0;
}
static inline int ext4_es_is_hole(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+ return (es->es_pblk & ES_HOLE) != 0;
}
-static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+static inline unsigned int ext4_es_status(struct extent_status *es)
{
- return (es->es_pblk & EXTENT_STATUS_FLAGS);
+ return es->es_pblk >> ES_SHIFT;
}
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
- return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+ return es->es_pblk & ~ES_MASK;
}
static inline void ext4_es_store_pblock(struct extent_status *es,
@@ -105,19 +117,16 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
{
ext4_fsblk_t block;
- block = (pb & ~EXTENT_STATUS_FLAGS) |
- (es->es_pblk & EXTENT_STATUS_FLAGS);
+ block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK);
es->es_pblk = block;
}
static inline void ext4_es_store_status(struct extent_status *es,
- unsigned long long status)
+ unsigned int status)
{
- ext4_fsblk_t block;
-
- block = (status & EXTENT_STATUS_FLAGS) |
- (es->es_pblk & ~EXTENT_STATUS_FLAGS);
- es->es_pblk = block;
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (es->es_pblk & ~ES_MASK));
}
extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6f4cc567c382..3da21945ff1f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -149,7 +149,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 || ret == -EIOCBQUEUED) {
+ if (ret > 0) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
@@ -219,7 +219,6 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
{
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct ext4_inode_info *ei = EXT4_I(inode);
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
@@ -259,22 +258,10 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
- if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
- struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
-
- spin_lock(&inode->i_lock);
- if (!ei->jinode) {
- if (!jinode) {
- spin_unlock(&inode->i_lock);
- return -ENOMEM;
- }
- ei->jinode = jinode;
- jbd2_journal_init_jbd_inode(ei->jinode, inode);
- jinode = NULL;
- }
- spin_unlock(&inode->i_lock);
- if (unlikely(jinode != NULL))
- jbd2_free_inode(jinode);
+ if (filp->f_mode & FMODE_WRITE) {
+ int ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ return ret;
}
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8bf5999875ee..137193ff389b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,18 +70,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
+ struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
- ext4_free_group_clusters_set(sb, gdp, 0);
- ext4_free_inodes_set(sb, gdp, 0);
- ext4_itable_unused_set(sb, gdp, 0);
- memset(bh->b_data, 0xff, sb->s_blocksize);
- ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
- EXT4_INODES_PER_GROUP(sb) / 8);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
@@ -117,6 +115,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
+ struct ext4_group_info *grp;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -185,6 +184,8 @@ verify:
put_bh(bh);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, bitmap_blk);
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return NULL;
}
ext4_unlock_group(sb, block_group);
@@ -221,6 +222,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
+ struct ext4_group_info *grp;
if (!sb) {
printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
@@ -266,7 +268,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh)
+ /* Don't bother if the inode bitmap is corrupt. */
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
goto error_return;
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -315,8 +319,10 @@ out:
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!fatal)
fatal = err;
- } else
+ } else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ }
error_return:
brelse(bitmap_bh);
@@ -625,6 +631,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
/*
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk. (Yes, these values are
+ * somewhat arbitrary...)
+ */
+#define RECENTCY_MIN 5
+#define RECENTCY_DIRTY 30
+
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+ struct ext4_group_desc *gdp;
+ struct ext4_inode *raw_inode;
+ struct buffer_head *bh;
+ unsigned long dtime, now;
+ int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int offset, ret = 0, recentcy = RECENTCY_MIN;
+
+ gdp = ext4_get_group_desc(sb, group, NULL);
+ if (unlikely(!gdp))
+ return 0;
+
+ bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+ (ino / inodes_per_block));
+ if (unlikely(!bh) || !buffer_uptodate(bh))
+ /*
+ * If the block is not in the buffer cache, then it
+ * must have been written out.
+ */
+ goto out;
+
+ offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+ dtime = le32_to_cpu(raw_inode->i_dtime);
+ now = get_seconds();
+ if (buffer_dirty(bh))
+ recentcy += RECENTCY_DIRTY;
+
+ if (dtime && (dtime < now) && (now < dtime + recentcy))
+ ret = 1;
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -652,6 +703,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
+ struct ext4_group_info *grp;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -725,10 +777,22 @@ got_group:
continue;
}
+ grp = ext4_get_group_info(sb, group);
+ /* Skip groups with already-known suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
+
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
- if (!inode_bitmap_bh)
- goto out;
+ /* Skip groups with suspicious inode tables */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (++group == ngroups)
+ group = 0;
+ continue;
+ }
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
@@ -741,6 +805,11 @@ repeat_in_this_group:
"inode=%lu", ino + 1);
continue;
}
+ if ((EXT4_SB(sb)->s_journal == NULL) &&
+ recently_deleted(sb, group, ino)) {
+ ino++;
+ goto next_inode;
+ }
if (!handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
@@ -764,6 +833,7 @@ repeat_in_this_group:
ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
+next_inode:
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
next_group:
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 87b30cd357e7..594009f5f523 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -23,7 +23,6 @@
#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
-#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dd32a2eacd0d..c79fd7dabe79 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -553,7 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
}
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
@@ -653,7 +653,7 @@ found:
if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
@@ -727,8 +727,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
+ ext4_io_end_t *io_end = ext4_inode_aio(inode);
+
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+ set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
@@ -969,7 +973,8 @@ retry_journal:
ext4_journal_stop(handle);
goto retry_grab;
}
- wait_on_page_writeback(page);
+ /* In case writeback began while the page was unlocked */
+ wait_for_stable_page(page);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -1633,7 +1638,7 @@ add_delayed:
set_buffer_delay(bh);
} else if (retval > 0) {
int ret;
- unsigned long long status;
+ unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
@@ -1890,12 +1895,32 @@ static int ext4_writepage(struct page *page,
return ret;
}
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+ int len;
+ loff_t size = i_size_read(mpd->inode);
+ int err;
+
+ BUG_ON(page->index != mpd->first_page);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ clear_page_dirty_for_io(page);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ if (!err)
+ mpd->wbc->nr_to_write--;
+ mpd->first_page++;
+
+ return err;
+}
+
#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
/*
* mballoc gives us at most this number of blocks...
* XXX: That seems to be only a limitation of ext4_mb_normalize_request().
- * The rest of mballoc seems to handle chunks upto full group size.
+ * The rest of mballoc seems to handle chunks up to full group size.
*/
#define MAX_WRITEPAGES_EXTENT_LEN 2048
@@ -1904,82 +1929,94 @@ static int ext4_writepage(struct page *page,
*
* @mpd - extent of blocks
* @lblk - logical number of the block in the file
- * @b_state - b_state of the buffer head added
+ * @bh - buffer head we want to add to the extent
*
- * the function is used to collect contig. blocks in same state
+ * The function is used to collect contig. blocks in the same state. If the
+ * buffer doesn't require mapping for writeback and we haven't started the
+ * extent of buffers to map yet, the function returns 'true' immediately - the
+ * caller can write the buffer right away. Otherwise the function returns true
+ * if the block has been added to the extent, false if the block couldn't be
+ * added.
*/
-static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
- unsigned long b_state)
+static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+ struct buffer_head *bh)
{
struct ext4_map_blocks *map = &mpd->map;
- /* Don't go larger than mballoc is willing to allocate */
- if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
- return 0;
+ /* Buffer that doesn't need mapping for writeback? */
+ if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+ (!buffer_delay(bh) && !buffer_unwritten(bh))) {
+ /* So far no extent to map => we write the buffer right away */
+ if (map->m_len == 0)
+ return true;
+ return false;
+ }
/* First block in the extent? */
if (map->m_len == 0) {
map->m_lblk = lblk;
map->m_len = 1;
- map->m_flags = b_state & BH_FLAGS;
- return 1;
+ map->m_flags = bh->b_state & BH_FLAGS;
+ return true;
}
+ /* Don't go larger than mballoc is willing to allocate */
+ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+ return false;
+
/* Can we merge the block to our big extent? */
if (lblk == map->m_lblk + map->m_len &&
- (b_state & BH_FLAGS) == map->m_flags) {
+ (bh->b_state & BH_FLAGS) == map->m_flags) {
map->m_len++;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
-static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
- struct buffer_head *head,
- struct buffer_head *bh,
- ext4_lblk_t lblk)
+/*
+ * mpage_process_page_bufs - submit page buffers for IO or add them to extent
+ *
+ * @mpd - extent of blocks for mapping
+ * @head - the first buffer in the page
+ * @bh - buffer we should start processing from
+ * @lblk - logical number of the block in the file corresponding to @bh
+ *
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit
+ * the page for IO if all buffers in this page were mapped and there's no
+ * accumulated extent of buffers to map or add buffers in the page to the
+ * extent of buffers to map. The function returns 1 if the caller can continue
+ * by processing the next page, 0 if it should stop adding buffers to the
+ * extent to map because we cannot extend it anymore. It can also return value
+ * < 0 in case of error during IO submission.
+ */
+static int mpage_process_page_bufs(struct mpage_da_data *mpd,
+ struct buffer_head *head,
+ struct buffer_head *bh,
+ ext4_lblk_t lblk)
{
struct inode *inode = mpd->inode;
+ int err;
ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
>> inode->i_blkbits;
do {
BUG_ON(buffer_locked(bh));
- if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
- (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
- lblk >= blocks) {
+ if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
/* Found extent to map? */
if (mpd->map.m_len)
- return false;
- if (lblk >= blocks)
- return true;
- continue;
+ return 0;
+ /* Everything mapped so far and we hit EOF */
+ break;
}
- if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
- return false;
} while (lblk++, (bh = bh->b_this_page) != head);
- return true;
-}
-
-static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
-{
- int len;
- loff_t size = i_size_read(mpd->inode);
- int err;
-
- BUG_ON(page->index != mpd->first_page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
- else
- len = PAGE_CACHE_SIZE;
- clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
- if (!err)
- mpd->wbc->nr_to_write--;
- mpd->first_page++;
-
- return err;
+ /* So far everything mapped? Submit the page for IO. */
+ if (mpd->map.m_len == 0) {
+ err = mpage_submit_page(mpd, head->b_page);
+ if (err < 0)
+ return err;
+ }
+ return lblk < blocks;
}
/*
@@ -2003,8 +2040,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct inode *inode = mpd->inode;
struct buffer_head *head, *bh;
int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
- ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
- >> inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
@@ -2026,7 +2061,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
if (page->index > end)
break;
- /* Upto 'end' pages must be contiguous */
+ /* Up to 'end' pages must be contiguous */
BUG_ON(page->index != start);
bh = head = page_buffers(page);
do {
@@ -2039,18 +2074,26 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
*/
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
- add_page_bufs_to_extent(mpd, head, bh,
- lblk);
+ /*
+ * FIXME: If dioread_nolock supports
+ * blocksize < pagesize, we need to make
+ * sure we add size mapped so far to
+ * io_end->size as the following call
+ * can submit the page for IO.
+ */
+ err = mpage_process_page_bufs(mpd, head,
+ bh, lblk);
pagevec_release(&pvec);
- return 0;
+ if (err > 0)
+ err = 0;
+ return err;
}
if (buffer_delay(bh)) {
clear_buffer_delay(bh);
bh->b_blocknr = pblock++;
}
clear_buffer_unwritten(bh);
- } while (++lblk < blocks &&
- (bh = bh->b_this_page) != head);
+ } while (lblk++, (bh = bh->b_this_page) != head);
/*
* FIXME: This is going to break if dioread_nolock
@@ -2199,12 +2242,10 @@ static int mpage_map_and_submit_extent(handle_t *handle,
/* Update on-disk size after IO is submitted */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
- ext4_update_i_disksize(inode, disksize);
+ ext4_wb_update_i_disksize(inode, disksize);
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2)
ext4_error(inode->i_sb,
@@ -2219,7 +2260,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
/*
* Calculate the total number of credits to reserve for one writepages
* iteration. This is called from ext4_writepages(). We map an extent of
- * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
* the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
* bpp - 1 blocks in bpp different extents.
*/
@@ -2319,14 +2360,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
lblk = ((ext4_lblk_t)page->index) <<
(PAGE_CACHE_SHIFT - blkbits);
head = page_buffers(page);
- if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+ err = mpage_process_page_bufs(mpd, head, head, lblk);
+ if (err <= 0)
goto out;
- /* So far everything mapped? Submit the page for IO. */
- if (mpd->map.m_len == 0) {
- err = mpage_submit_page(mpd, page);
- if (err < 0)
- goto out;
- }
+ err = 0;
/*
* Accumulated enough dirty pages? This doesn't apply
@@ -2410,7 +2447,7 @@ static int ext4_writepages(struct address_space *mapping,
if (ext4_should_dioread_nolock(inode)) {
/*
- * We may need to convert upto one extent per block in
+ * We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
@@ -2646,7 +2683,7 @@ retry_journal:
goto retry_grab;
}
/* In case writeback began while the page was unlocked */
- wait_on_page_writeback(page);
+ wait_for_stable_page(page);
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
if (ret < 0) {
@@ -2991,19 +3028,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private, int ret,
- bool is_async)
+ ssize_t size, void *private)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ext4_io_end_t *io_end = iocb->private;
/* if not async direct IO just return */
- if (!io_end) {
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
+ if (!io_end)
return;
- }
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3013,11 +3044,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
iocb->private = NULL;
io_end->offset = offset;
io_end->size = size;
- if (is_async) {
- io_end->iocb = iocb;
- io_end->result = ret;
- }
- ext4_put_io_end_defer(io_end);
+ ext4_put_io_end(io_end);
}
/*
@@ -3102,7 +3129,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ret = -ENOMEM;
goto retake_lock;
}
- io_end->flag |= EXT4_IO_END_DIRECT;
/*
* Grab reference for DIO. Will be dropped in ext4_end_io_dio()
*/
@@ -3147,13 +3173,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
WARN_ON(iocb->private != io_end);
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- WARN_ON(io_end->iocb);
- /*
- * Generic code already did inode_dio_done() so we
- * have to clear EXT4_IO_END_DIRECT to not do it for
- * the second time.
- */
- io_end->flag = 0;
ext4_put_io_end(io_end);
iocb->private = NULL;
}
@@ -3533,6 +3552,18 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
offset;
}
+ if (offset & (sb->s_blocksize - 1) ||
+ (offset + length) & (sb->s_blocksize - 1)) {
+ /*
+ * Attach jinode to inode for jbd2 if we do any zeroing of
+ * partial block
+ */
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret < 0)
+ goto out_mutex;
+
+ }
+
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
@@ -3601,6 +3632,31 @@ out_mutex:
return ret;
}
+int ext4_inode_attach_jinode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct jbd2_inode *jinode;
+
+ if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
+ return 0;
+
+ jinode = jbd2_alloc_inode(GFP_KERNEL);
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
+ }
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -3661,6 +3717,12 @@ void ext4_truncate(struct inode *inode)
return;
}
+ /* If we zero-out tail of the page, we have to create jinode for jbd2 */
+ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
+ if (ext4_inode_attach_jinode(inode) < 0)
+ return;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
@@ -4523,7 +4585,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ handle_t *handle;
+ loff_t oldsize = inode->i_size;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4531,73 +4595,69 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
- }
-
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size)) {
- handle_t *handle;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto err_out;
- }
- if (ext4_handle_valid(handle)) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
- EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
- ext4_journal_stop(handle);
-
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (S_ISREG(inode->i_mode) &&
+ (attr->ia_size < inode->i_size)) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error) {
- /* Do as much error cleanup as possible */
- handle = ext4_journal_start(inode,
- EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
+ if (error)
goto err_out;
- }
- ext4_orphan_del(handle, inode);
- orphan = 0;
- ext4_journal_stop(handle);
+ }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
goto err_out;
}
- }
- }
-
- if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != inode->i_size) {
- loff_t oldsize = inode->i_size;
-
- i_size_write(inode, attr->ia_size);
- /*
- * Blocks are going to be removed from the inode. Wait
- * for dio in flight. Temporarily disable
- * dioread_nolock to prevent livelock.
- */
- if (orphan) {
- if (!ext4_should_journal_data(inode)) {
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
- ext4_inode_resume_unlocked_dio(inode);
- } else
- ext4_wait_for_tail_page_commit(inode);
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
}
+ down_write(&EXT4_I(inode)->i_data_sem);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
/*
- * Truncate pagecache after we've waited for commit
- * in data=journal mode to make pages freeable.
+ * We have to update i_size under i_data_sem together
+ * with i_disksize to avoid races with writeback code
+ * running ext4_wb_update_i_disksize().
*/
- truncate_pagecache(inode, oldsize, inode->i_size);
+ if (!error)
+ i_size_write(inode, attr->ia_size);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_journal_stop(handle);
+ if (error) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ } else
+ i_size_write(inode, attr->ia_size);
+
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight. Temporarily disable
+ * dioread_nolock to prevent livelock.
+ */
+ if (orphan) {
+ if (!ext4_should_journal_data(inode)) {
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+ ext4_inode_resume_unlocked_dio(inode);
+ } else
+ ext4_wait_for_tail_page_commit(inode);
}
- ext4_truncate(inode);
+ /*
+ * Truncate pagecache after we've waited for commit
+ * in data=journal mode to make pages freeable.
+ */
+ truncate_pagecache(inode, oldsize, inode->i_size);
}
+ /*
+ * We want to call ext4_truncate() even if attr->ia_size ==
+ * inode->i_size for cases like truncation of fallocated space
+ */
+ if (attr->ia_valid & ATTR_SIZE)
+ ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..a569d335f804 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,7 +17,6 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
-#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
@@ -77,8 +76,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
- memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
- memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+ ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
+ ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode1);
+ ext4_es_lru_del(inode2);
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
@@ -622,6 +623,8 @@ resizefs_out:
return 0;
}
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
default:
return -ENOTTY;
@@ -686,6 +689,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT:
case FITRIM:
case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4bbbf13bd743..a41e3ba8cfaa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -751,13 +751,15 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u clusters in bitmap, %u in gd",
+ "%u clusters in bitmap, %u in gd; "
+ "block bitmap corrupt.",
free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -1398,6 +1400,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1423,7 +1429,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
inode ? inode->i_ino : 0,
blocknr,
"freeing already freed block "
- "(bit %u)", block);
+ "(bit %u); block bitmap corrupt.",
+ block);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
mb_regenerate_buddy(e4b);
goto done;
}
@@ -1790,6 +1800,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
@@ -1987,6 +2002,9 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (cr <= 2 && free < ac->ac_g_ex.fe_len)
return 0;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -4585,6 +4603,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
unsigned int count_clusters;
int err = 0;
@@ -4673,6 +4692,10 @@ do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
@@ -4784,7 +4807,6 @@ do_more:
ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
@@ -4792,10 +4814,23 @@ do_more:
&sbi->s_flex_groups[flex_group].free_clusters);
}
- ext4_mb_unload_buddy(&e4b);
-
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+
+ ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 49e8bdff9163..2ae73a80c19b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
newext.ee_block = cpu_to_le32(lb->first_block);
newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
ext4_ext_store_pblock(&newext, lb->first_pblock);
- path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+ path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) {
retval = PTR_ERR(path);
@@ -494,7 +494,7 @@ int ext4_ext_migrate(struct inode *inode)
* superblock modification.
*
* For the tmp_inode we already have committed the
- * trascation that created the inode. Later as and
+ * transaction that created the inode. Later as and
* when we add extents we extent the journal
*/
/*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e86dddbd8296..7fa4d855dbd5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
int ret = 0;
struct ext4_ext_path *path;
- path = ext4_ext_find_extent(inode, lblock, *orig_path);
+ path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
ret = PTR_ERR(path);
else if (path[ext_depth(inode)].p_ext == NULL)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0dbc4b..1bec5a5c1e45 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3005,15 +3005,19 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
+ *
+ * n.b. old_{dentry,inode) refers to the source dentry/inode
+ * while new_{dentry,inode) refers to the destination dentry/inode
+ * This comes from rename(const char *oldpath, const char *newpath)
*/
static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- handle_t *handle;
+ handle_t *handle = NULL;
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval, force_da_alloc = 0;
+ int retval;
int inlined = 0, new_inlined = 0;
struct ext4_dir_entry_2 *parent_de;
@@ -3026,14 +3030,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* in separate transaction */
if (new_dentry->d_inode)
dquot_initialize(new_dentry->d_inode);
- handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
- ext4_handle_sync(handle);
old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
/*
@@ -3056,6 +3052,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new_bh = NULL;
}
}
+ if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old_inode);
+
+ handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ ext4_handle_sync(handle);
+
if (S_ISDIR(old_inode->i_mode)) {
if (new_inode) {
retval = -ENOTEMPTY;
@@ -3186,8 +3194,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext4_orphan_add(handle, new_inode);
- if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- force_da_alloc = 1;
}
retval = 0;
@@ -3195,9 +3201,8 @@ end_rename:
brelse(dir_bh);
brelse(old_bh);
brelse(new_bh);
- ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
+ if (handle)
+ ext4_journal_stop(handle);
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 6625d210fb45..d7d0c7b46ed4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -123,10 +123,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
- if (io_end->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(io_end->inode);
- if (io_end->iocb)
- aio_complete(io_end->iocb, io_end->result, 0);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -204,19 +200,14 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ /* Only reserved conversions from writeback should enter here */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ WARN_ON(!io_end->handle);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (io_end->handle) {
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
- if (list_empty(&ei->i_rsv_conversion_list))
- queue_work(wq, &ei->i_rsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
- } else {
- wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
- if (list_empty(&ei->i_unrsv_conversion_list))
- queue_work(wq, &ei->i_unrsv_conversion_work);
- list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
- }
+ wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ if (list_empty(&ei->i_rsv_conversion_list))
+ queue_work(wq, &ei->i_rsv_conversion_work);
+ list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
@@ -256,13 +247,6 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}
-void ext4_end_io_unrsv_work(struct work_struct *work)
-{
- struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
- i_unrsv_conversion_work);
- ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
-}
-
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 36b141e420b7..049c8a8bdc0e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -762,9 +762,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- flush_workqueue(sbi->unrsv_conversion_wq);
flush_workqueue(sbi->rsv_conversion_wq);
- destroy_workqueue(sbi->unrsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
if (sbi->s_journal) {
@@ -875,14 +873,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
- INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
- INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
return &ei->vfs_inode;
}
@@ -1134,8 +1130,8 @@ enum {
Opt_nouid32, Opt_debug, Opt_removed,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
- Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
- Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
+ Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -1179,6 +1175,7 @@ static const match_table_t tokens = {
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_path, "journal_path=%s"},
{Opt_journal_checksum, "journal_checksum"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_abort, "abort"},
@@ -1338,6 +1335,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
#define MOPT_NO_EXT2 0x0100
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
+#define MOPT_STRING 0x0400
static const struct mount_opts {
int token;
@@ -1359,7 +1357,7 @@ static const struct mount_opts {
{Opt_delalloc, EXT4_MOUNT_DELALLOC,
MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
- MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT},
+ MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_SET},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1387,6 +1385,7 @@ static const struct mount_opts {
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
{Opt_journal_dev, 0, MOPT_GTE0},
+ {Opt_journal_path, 0, MOPT_STRING},
{Opt_journal_ioprio, 0, MOPT_GTE0},
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
@@ -1480,7 +1479,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
- if (args->from && match_int(args, &arg))
+ if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
@@ -1544,6 +1543,44 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
*journal_devnum = arg;
+ } else if (token == Opt_journal_path) {
+ char *journal_path;
+ struct inode *journal_inode;
+ struct path path;
+ int error;
+
+ if (is_remount) {
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
+ return -1;
+ }
+ journal_path = match_strdup(&args[0]);
+ if (!journal_path) {
+ ext4_msg(sb, KERN_ERR, "error: could not dup "
+ "journal device string");
+ return -1;
+ }
+
+ error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
+ if (error) {
+ ext4_msg(sb, KERN_ERR, "error: could not find "
+ "journal device path: error %d", error);
+ kfree(journal_path);
+ return -1;
+ }
+
+ journal_inode = path.dentry->d_inode;
+ if (!S_ISBLK(journal_inode->i_mode)) {
+ ext4_msg(sb, KERN_ERR, "error: journal path %s "
+ "is not a block device", journal_path);
+ path_put(&path);
+ kfree(journal_path);
+ return -1;
+ }
+
+ *journal_devnum = new_encode_dev(journal_inode->i_rdev);
+ path_put(&path);
+ kfree(journal_path);
} else if (token == Opt_journal_ioprio) {
if (arg > 7) {
ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
@@ -3483,7 +3520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (test_opt(sb, DIOREAD_NOLOCK)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and delalloc");
+ "both data=journal and dioread_nolock");
goto failed_mount;
}
if (test_opt(sb, DELALLOC))
@@ -3954,14 +3991,6 @@ no_journal:
goto failed_mount4;
}
- EXT4_SB(sb)->unrsv_conversion_wq =
- alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
- if (!EXT4_SB(sb)->unrsv_conversion_wq) {
- printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
- ret = -ENOMEM;
- goto failed_mount4;
- }
-
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -4115,8 +4144,6 @@ failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- if (EXT4_SB(sb)->unrsv_conversion_wq)
- destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
failed_mount_wq:
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -4564,7 +4591,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
- flush_workqueue(sbi->unrsv_conversion_wq);
/*
* Writeback quota in non-journalled quota case - journalled quota has
* no dirty dquots
@@ -4600,7 +4626,6 @@ static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
trace_ext4_sync_fs(sb, wait);
flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
- flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
dquot_writeback_dquots(sb, -1);
if (wait && test_opt(sb, BARRIER))
ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
@@ -4727,6 +4752,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, "Abort forced by user");