diff options
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/balloc.c | 7 | ||||
-rw-r--r-- | fs/ext4/crypto.c | 80 | ||||
-rw-r--r-- | fs/ext4/crypto_fname.c | 32 | ||||
-rw-r--r-- | fs/ext4/crypto_key.c | 42 | ||||
-rw-r--r-- | fs/ext4/dir.c | 13 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 46 | ||||
-rw-r--r-- | fs/ext4/ext4_crypto.h | 2 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 2 | ||||
-rw-r--r-- | fs/ext4/extents.c | 130 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 4 | ||||
-rw-r--r-- | fs/ext4/file.c | 157 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 8 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 29 | ||||
-rw-r--r-- | fs/ext4/inline.c | 8 | ||||
-rw-r--r-- | fs/ext4/inode.c | 424 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 7 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 83 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 12 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 2 | ||||
-rw-r--r-- | fs/ext4/mmp.c | 34 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 16 | ||||
-rw-r--r-- | fs/ext4/namei.c | 26 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 4 | ||||
-rw-r--r-- | fs/ext4/resize.c | 2 | ||||
-rw-r--r-- | fs/ext4/super.c | 35 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 166 | ||||
-rw-r--r-- | fs/ext4/xattr.h | 3 |
27 files changed, 811 insertions, 563 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index ec0668a60678..fe1f50fe764f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init block bitmap for group " + "%u: %d", block_group, err); goto out; + } goto verify; } ext4_unlock_group(sb, block_group); diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index c8021208a7eb..edc053a81914 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -18,11 +18,9 @@ * Special Publication 800-38E and IEEE P1619/D16. */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> -#include <linux/crypto.h> #include <linux/ecryptfs.h> #include <linux/gfp.h> #include <linux/kernel.h> @@ -261,21 +259,21 @@ static int ext4_page_crypto(struct inode *inode, { u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback( + skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_crypt_complete, &ecr); @@ -288,21 +286,21 @@ static int ext4_page_crypto(struct inode *inode, sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); if (rw == EXT4_DECRYPT) - res = crypto_ablkcipher_decrypt(req); + res = crypto_skcipher_decrypt(req); else - res = crypto_ablkcipher_encrypt(req); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res) { printk_ratelimited( KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", + "%s: crypto_skcipher_encrypt() returned %d\n", __func__, res); return res; } @@ -467,3 +465,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) return size; return 0; } + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info; + int dir_has_key, cached_with_key; + + if (!ext4_encrypted_inode(dir)) + return 0; + + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + cached_with_key = dentry->d_fsdata != NULL; + dir_has_key = (ci != NULL); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) { +#if 0 /* Revalidation debug */ + char buf[80]; + char *cp = simple_dname(dentry, buf, sizeof(buf)); + + if (IS_ERR(cp)) + cp = (char *) "???"; + pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, + cached_with_key, d_is_negative(dentry), + dir_has_key); +#endif + return 0; + } + return 1; +} + +const struct dentry_operations ext4_encrypted_d_ops = { + .d_revalidate = ext4_d_revalidate, +}; diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 2fbef8a14760..1a2f360405db 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -11,11 +11,9 @@ * */ -#include <crypto/hash.h> -#include <crypto/sha.h> +#include <crypto/skcipher.h> #include <keys/encrypted-type.h> #include <keys/user-type.h> -#include <linux/crypto.h> #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/key.h> @@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode, struct ext4_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; @@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); @@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode, /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited( KERN_ERR "%s: Error (error code %d)\n", __func__, res); @@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode, struct ext4_str *oname) { struct ext4_str tmp_in[2], tmp_out[1]; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; unsigned lim = max_name_len(inode); @@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode, tmp_out[0].name = oname->name; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); @@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode, /* Create encryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited( KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 9a16d1e75a49..0129d688d1f7 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -8,6 +8,7 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ +#include <crypto/skcipher.h> #include <keys/encrypted-type.h> #include <keys/user-type.h> #include <linux/random.h> @@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) { int res = 0; - struct ablkcipher_request *req = NULL; + struct skcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); if (IS_ERR(tfm)) { res = PTR_ERR(tfm); tfm = NULL; goto out; } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { res = -ENOMEM; goto out; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - EXT4_AES_128_ECB_KEY_SIZE); + res = crypto_skcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); if (res < 0) goto out; sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - EXT4_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); res = ecr.res; } out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); + skcipher_request_free(req); + crypto_free_skcipher(tfm); return res; } @@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci) if (ci->ci_keyring_key) key_put(ci->ci_keyring_key); - crypto_free_ablkcipher(ci->ci_ctfm); + crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(ext4_crypt_info_cachep, ci); } @@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode) struct ext4_encryption_context ctx; const struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct crypto_ablkcipher *ctfm; + struct crypto_skcipher *ctfm; const char *cipher_str; char raw_key[EXT4_MAX_KEY_SIZE]; char mode; @@ -237,7 +235,7 @@ retry: if (res) goto out; got_key: - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; printk(KERN_DEBUG @@ -246,11 +244,11 @@ got_key: goto out; } crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm), CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - ext4_encryption_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); if (res) goto out; memzero_explicit(raw_key, sizeof(raw_key)); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d1bca74f844..33f5e2a50cf8 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + if (ext4_encrypted_inode(inode)) { + err = ext4_get_encryption_info(inode); + if (err && err != -ENOKEY) + return err; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { @@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } } if (!bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0662b285dc8a..393689dfa1af 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -42,6 +42,18 @@ */ /* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG @@ -182,9 +194,9 @@ typedef struct ext4_io_end { struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ + atomic_t count; /* reference counter */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ - atomic_t count; /* reference counter */ } ext4_io_end_t; struct ext4_io_submit { @@ -1024,13 +1036,8 @@ struct ext4_inode_info { * transaction reserved */ struct list_head i_rsv_conversion_list; - /* - * Completed IOs that need unwritten extents handling and don't have - * transaction reserved - */ - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ struct work_struct i_rsv_conversion_work; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; @@ -1513,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } -static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) -{ - return inode->i_private; -} - -static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) -{ - inode->i_private = io; -} - /* * Inode dynamic state flags */ @@ -2302,6 +2299,7 @@ struct page *ext4_encrypt(struct inode *inode, int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern const struct dentry_operations ext4_encrypted_d_ops; #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2505,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); + struct buffer_head *bh_result, int create); +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, @@ -2558,6 +2558,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -3284,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ EXT4_WQ_HASH_SZ]) -#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; #define EXT4_RESIZING 0 extern int ext4_resize_begin(struct super_block *sb); diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index ac7d4e813796..1f73c29717e1 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -77,7 +77,7 @@ struct ext4_crypt_info { char ci_data_mode; char ci_filename_mode; char ci_flags; - struct crypto_ablkcipher *ci_ctfm; + struct crypto_skcipher *ci_ctfm; struct key *ci_keyring_key; char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 3c9381547094..8ecf84b8f5a1 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ffabaf90aa5..95bf4679ac54 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -15,7 +15,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ @@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + /* + * The check for IO to unwritten extent is somewhat racy as we + * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after + * dropping i_data_sem. But reserved blocks should save us in that + * case. + */ if (ext4_ext_is_unwritten(ex1) && (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || atomic_read(&EXT4_I(inode)->i_unwritten) || @@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } /* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap + * ext4_ext_determine_hole - determine hole around given block + * @inode: inode we lookup in + * @path: path in extent tree to @lblk + * @lblk: pointer to logical block around which we want to determine hole + * + * Determine hole length (and start if easily possible) around given logical + * block. We don't try too hard to find the beginning of the hole but @path + * actually points to extent before @lblk, we provide it. + * + * The function returns the length of a hole starting at @lblk. We update @lblk + * to the beginning of the hole if we managed to find it. */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t block) +static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *lblk) { int depth = ext_depth(inode); - ext4_lblk_t len; - ext4_lblk_t lblock; struct ext4_extent *ex; - struct extent_status es; + ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ - lblock = 0; + *lblk = 0; len = EXT_MAX_BLOCKS; - ext_debug("cache gap(whole file):"); - } else if (block < le32_to_cpu(ex->ee_block)) { - lblock = block; - len = le32_to_cpu(ex->ee_block) - block; - ext_debug("cache gap(before): %u [%u:%u]", - block, - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - } else if (block >= le32_to_cpu(ex->ee_block) + } else if (*lblk < le32_to_cpu(ex->ee_block)) { + len = le32_to_cpu(ex->ee_block) - *lblk; + } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; - lblock = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); + *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); - ext_debug("cache gap(after): [%u:%u] %u", - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), - block); - BUG_ON(next == lblock); - len = next - lblock; + BUG_ON(next == *lblk); + len = next - *lblk; } else { BUG(); } + return len; +} - ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, + ext4_lblk_t hole_len) +{ + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, hole_start, + hole_start + hole_len - 1, &es); if (es.es_len) { /* There's delayed extent containing lblock? */ - if (es.es_lblk <= lblock) + if (es.es_lblk <= hole_start) return; - len = min(es.es_lblk - lblock, len); + hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", lblock, len); - ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); + ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext4_es_insert_extent(inode, hole_start, hole_len, ~0, + EXTENT_STATUS_HOLE); } /* @@ -3927,8 +3943,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + struct ext4_ext_path **ppath, + unsigned int allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; - ext4_io_end_t *io = ext4_inode_aio(inode); ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " "block %llu, max_blocks %u, flags %x, allocated %u\n", @@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; - /* - * Flag the inode(non aio case) or end_io struct (aio case) - * that this IO needs to conversion to written when IO is - * completed - */ - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } @@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; - ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; - int set_unwritten = 0; bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", @@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( handle, inode, map, &path, - flags, allocated, newblock); + allocated); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; @@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + ext4_lblk_t hole_start, hole_len; + + hole_start = map->m_lblk; + hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); + + /* Update hole_len to reflect hole size after map->m_lblk */ + if (hole_start != map->m_lblk) + hole_len -= map->m_lblk - hole_start; + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, hole_len); + goto out2; } @@ -4482,15 +4497,6 @@ got_allocated_blocks: if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; - /* - * io_end structure was created for every IO write to an - * unwritten extent. To avoid unnecessary conversion, - * here we flag the IO that really needs the conversion. - * For non asycn direct IO case, flag the inode state - * that we need to perform conversion when IO is done. - */ - if (flags & EXT4_GET_BLOCKS_PRE_IO) - set_unwritten = 1; } err = 0; @@ -4501,14 +4507,6 @@ got_allocated_blocks: err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (!err && set_unwritten) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (err && free_on_err) { int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ac748b3af1c1..e38b987ac7f5 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -823,8 +823,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; - if (!ext4_es_is_referenced(es)) - ext4_es_set_referenced(es); + if (!ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1126436dada1..6659e216385e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct mutex *aio_mutex = NULL; struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; + int unaligned_aio = 0; int overwrite = 0; ssize_t ret; + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + /* - * Unaligned direct AIO must be serialized; see comment above - * In the case of O_APPEND, assume that we must always serialize + * Unaligned direct AIO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned AIOs can result in data + * corruption. */ - if (o_direct && - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && - (iocb->ki_flags & IOCB_APPEND || - ext4_unaligned_aio(inode, from, iocb->ki_pos))) { - aio_mutex = ext4_aio_mutex(inode); - mutex_lock(aio_mutex); + ext4_unaligned_aio(inode, from, iocb->ki_pos)) { + unaligned_aio = 1; ext4_unwritten_wait(inode); } - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto out; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !aio_mutex && + if (ext4_should_dioread_nolock(inode) && !unaligned_aio && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) blk_finish_plug(&plug); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; out: inode_unlock(inode); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; } @@ -262,23 +256,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return result; } -static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - int err; - struct inode *inode = file_inode(vma->vm_file); - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(inode->i_sb); - - return err; -} - /* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. Note that since we cycle through * i_mmap_sem, we are sure that also any hole punching that began before we * were called is finished by now and so if it included part of the file we @@ -311,7 +290,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, - .page_mkwrite = ext4_dax_mkwrite, + .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else @@ -350,6 +329,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; + struct inode *dir = filp->f_path.dentry->d_parent->d_inode; struct path path; char buf[64], *cp; int ret; @@ -393,6 +373,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encryption_info(inode) == NULL) return -ENOKEY; } + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return -EPERM; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -423,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) */ static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, - struct ext4_map_blocks *map, + ext4_lblk_t end_blk, loff_t *offset) { struct pagevec pvec; @@ -438,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -556,12 +544,11 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t dataoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -578,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; - break; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret <= 0) { + /* No extent found -> no data */ + if (ret == 0) + ret = -ENXIO; + inode_unlock(inode); + return ret; } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + last = es.es_lblk; + if (last != start) + dataoff = (loff_t)last << blkbits; + if (!ext4_es_is_unwritten(&es)) break; - } /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + es.es_lblk + es.es_len, &dataoff)) + break; + last += es.es_len; dataoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); @@ -629,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t holeoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -651,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) holeoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret < 0) { + inode_unlock(inode); + return ret; } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; + /* Found a hole? */ + if (ret == 0 || es.es_lblk > last) { + if (last != start) + holeoff = (loff_t)last << blkbits; + break; } - /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } - } + if (ext4_es_is_unwritten(&es) && + ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + last + es.es_len, &holeoff)) + break; - /* find a hole */ - break; + last += es.es_len; + holeoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3fcfd50a2e8a..237b877d316d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) + if (err) { + ext4_error(sb, "Failed to init inode bitmap for group " + "%u: %d", block_group, err); goto out; + } return bh; } ext4_unlock_group(sb, block_group); @@ -785,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, sbi = EXT4_SB(sb); /* - * Initalize owners and quota early so that we don't have to account + * Initialize owners and quota early so that we don't have to account * for quota initialization worst case in standard inode creating * transaction */ diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 355ef9c36c87..3027fa681de5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, goto got_it; } - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + /* Next simple case - plain lookup failed */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); + int i; + + /* Count number blocks in a subtree under 'partial' */ + count = 1; + for (i = 0; partial + i != chain + depth - 1; i++) + count *= epb; + /* Fill in size of a hole we found */ + map->m_pblk = 0; + map->m_len = min_t(unsigned int, map->m_len, count); + goto cleanup; + } + + /* Failed read of indirect block */ + if (err == -EIO) goto cleanup; /* @@ -693,21 +708,21 @@ retry: } if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, 0); + ext4_dio_get_block, NULL, 0); else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ext4_get_block, NULL, - NULL, 0); + offset, ext4_dio_get_block, + NULL, NULL, 0); inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) ret = dax_do_io(iocb, inode, iter, offset, - ext4_get_block, NULL, DIO_LOCKING); + ext4_dio_get_block, NULL, DIO_LOCKING); else ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext4_get_block); + ext4_dio_get_block); if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index dfe3b9bafc0d..7cbdd3752ba5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -581,9 +581,10 @@ retry: if (ret) goto out; - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, from, to, ext4_get_block_write); - else + if (ext4_should_dioread_nolock(inode)) { + ret = __block_write_begin(page, from, to, + ext4_get_block_unwritten); + } else ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { @@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle, if (err) goto out; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_mark_inode_dirty(handle, dir); if (unlikely(err)) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 83bc8bfb3bea..b2e9576450eb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode) } truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); - WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - /* * Protect us against freezing - iput() caller didn't have to have any * protection against it @@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocated. - * if create==0 and the blocks are pre-allocated and unwritten block, - * the result buffer head is unmapped. If the create ==1, it will make sure - * the buffer head is mapped. + * On success, it returns the number of blocks being mapped or allocated. if + * create==0 and the blocks are pre-allocated and unwritten, the resulting @map + * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that case, buffer head is unmapped + * that case, @map is returned as unmapped but we still do fill map->m_len to + * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ @@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + map->m_pblk = 0; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; retval = 0; } else { BUG_ON(1); @@ -686,16 +688,39 @@ out_sem: return retval; } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 +/* + * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages + * we have to be careful as someone else may be manipulating b_state as well. + */ +static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) +{ + unsigned long old_state; + unsigned long new_state; + + flags &= EXT4_MAP_FLAGS; + + /* Dummy buffer_head? Set non-atomically. */ + if (!bh->b_page) { + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; + return; + } + /* + * Someone else may be modifying b_state. Be careful! This is ugly but + * once we get rid of using bh as a container for mapping information + * to pass to / from get_block functions, this can go away. + */ + do { + old_state = READ_ONCE(bh->b_state); + new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; + } while (unlikely( + cmpxchg(&bh->b_state, old_state, new_state) != old_state)); +} static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { - handle_t *handle = ext4_journal_current_handle(); struct ext4_map_blocks map; - int ret = 0, started = 0; - int dio_credits; + int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; @@ -703,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { - /* Direct IO write... */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, - dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - started = 1; - } - - ret = ext4_map_blocks(handle, inode, &map, flags); + ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, + flags); if (ret > 0) { - ext4_io_end_t *io_end = ext4_inode_aio(inode); - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) - set_buffer_defer_completion(bh); + ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } - if (started) - ext4_journal_stop(handle); return ret; } @@ -741,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } /* + * Get block function used when preparing for buffered write if we require + * creating an unwritten extent if blocks haven't been allocated. The extent + * will be converted to written after the IO is complete. + */ +int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); +} + +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static handle_t *start_dio_trans(struct inode *inode, + struct buffer_head *bh_result) +{ + int dio_credits; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) + bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; + dio_credits = ext4_chunk_trans_blocks(inode, + bh_result->b_size >> inode->i_blkbits); + return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); +} + +/* Get block function for DIO reads and writes to inodes without extents */ +int ext4_dio_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + if (create) { + handle = start_dio_trans(inode, bh); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + ret = _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); + if (create) + ext4_journal_stop(handle); + return ret; +} + +/* + * Get block function for AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete. + */ +static int ext4_dio_get_block_unwritten_async(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * When doing DIO using unwritten extents, we need io_end to convert + * unwritten extents to written on IO completion. We allocate io_end + * once we spot unwritten extent and store it in b_private. Generic + * DIO code keeps b_private set and furthermore passes the value to + * our completion callback in 'private' argument. + */ + if (!ret && buffer_unwritten(bh_result)) { + if (!bh_result->b_private) { + ext4_io_end_t *io_end; + + io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!io_end) + return -ENOMEM; + bh_result->b_private = io_end; + ext4_set_io_unwritten_flag(inode, io_end); + } + set_buffer_defer_completion(bh_result); + } + + return ret; +} + +/* + * Get block function for non-AIO DIO writes when we create unwritten extent if + * blocks are not allocated yet. The extent will be converted to written + * after IO is complete from ext4_ext_direct_IO() function. + */ +static int ext4_dio_get_block_unwritten_sync(struct inode *inode, + sector_t iblock, struct buffer_head *bh_result, int create) +{ + handle_t *handle; + int ret; + + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + handle = start_dio_trans(inode, bh_result); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_IO_CREATE_EXT); + ext4_journal_stop(handle); + + /* + * Mark inode as having pending DIO writes to unwritten extents. + * ext4_ext_direct_IO() checks this flag and converts extents to + * written. + */ + if (!ret && buffer_unwritten(bh_result)) + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + + return ret; +} + +static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + + ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", + inode->i_ino, create); + /* We don't expect handle for direct IO */ + WARN_ON_ONCE(ext4_journal_current_handle()); + + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); + + return ret; +} + + +/* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, @@ -1051,13 +1206,14 @@ retry_journal: #ifdef CONFIG_EXT4_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(page, pos, len, - ext4_get_block_write); + ext4_get_block_unwritten); else ret = ext4_block_write_begin(page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, ext4_get_block_write); + ret = __block_write_begin(page, pos, len, + ext4_get_block_unwritten); else ret = __block_write_begin(page, pos, len, ext4_get_block); #endif @@ -1685,7 +1841,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked @@ -2450,6 +2606,10 @@ static int ext4_writepages(struct address_space *mapping, trace_ext4_writepages(inode, wbc); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -3056,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } -/* - * ext4_get_block used when preparing for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after the IO is complete. - */ -int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); -} - -static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - - ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", - inode->i_ino, create); - ret = _ext4_get_block(inode, iblock, bh_result, 0); - /* - * Blocks should have been preallocated! ext4_file_write_iter() checks - * that. - */ - WARN_ON_ONCE(!buffer_mapped(bh_result)); - - return ret; -} - #ifdef CONFIG_FS_DAX int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -3147,13 +3276,12 @@ out: WARN_ON_ONCE(ret == 0 && create); if (ret > 0) { map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; /* * At least for now we have to clear BH_New so that DAX code * doesn't attempt to zero blocks again in a racy way. */ - bh_result->b_state &= ~(1 << BH_New); + map.m_flags &= ~EXT4_MAP_NEW; + ext4_update_bh_state(bh_result, map.m_flags); bh_result->b_size = map.m_len << inode->i_blkbits; ret = 0; } @@ -3164,7 +3292,7 @@ out: static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { - ext4_io_end_t *io_end = iocb->private; + ext4_io_end_t *io_end = private; /* if not async direct IO just return */ if (!io_end) @@ -3172,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); + io_end, io_end->inode->i_ino, iocb, offset, size); - iocb->private = NULL; io_end->offset = offset; io_end->size = size; ext4_put_io_end(io_end); @@ -3211,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; - ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) @@ -3236,16 +3361,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* * We could direct write to holes and fallocate. * - * Allocated blocks to fill the hole are marked as - * unwritten to prevent parallel buffered read to expose - * the stale data before DIO complete the data IO. + * Allocated blocks to fill the hole are marked as unwritten to prevent + * parallel buffered read to expose the stale data before DIO complete + * the data IO. * - * As to previously fallocated extents, ext4 get_block will - * just simply mark the buffer mapped but still keep the - * extents unwritten. + * As to previously fallocated extents, ext4 get_block will just simply + * mark the buffer mapped but still keep the extents unwritten. * - * For non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. + * For non AIO case, we will convert those unwritten extents to written + * after return back from blockdev_direct_IO. That way we save us from + * allocating io_end structure and also the overhead of offloading + * the extent convertion to a workqueue. * * For async DIO, the conversion needs to be deferred when the * IO is completed. The ext4 end_io callback function will be @@ -3253,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - - if (overwrite) { - get_block_func = ext4_get_block_overwrite; + if (overwrite) + get_block_func = ext4_dio_get_block_overwrite; + else if (is_sync_kiocb(iocb)) { + get_block_func = ext4_dio_get_block_unwritten_sync; + dio_flags = DIO_LOCKING; } else { - get_block_func = ext4_get_block_write; + get_block_func = ext4_dio_get_block_unwritten_async; dio_flags = DIO_LOCKING; } #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3291,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); - /* - * Put our reference to io_end. This can free the io_end structure e.g. - * in sync IO case or in case of error. It can even perform extent - * conversion if all bios we submitted finished before we got here. - * Note that in that case iocb->private can be already set to NULL - * here. - */ - if (io_end) { - ext4_inode_aio_set(inode, NULL); - ext4_put_io_end(io_end); - /* - * When no IO was submitted ext4_end_io_dio() was not - * called so we have to put iocb's reference. - */ - if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { - WARN_ON(iocb->private != io_end); - WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); - ext4_put_io_end(io_end); - iocb->private = NULL; - } - } if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; @@ -3326,7 +3414,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } -retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ @@ -4127,7 +4214,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -5229,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { @@ -5259,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) } } } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; + return ext4_mark_iloc_dirty(handle, inode, &iloc); } /* @@ -5470,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unlock_page(page); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) - get_block = ext4_get_block_write; + get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: @@ -5513,3 +5600,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } + +/* + * Find the first extent at or after @lblk in an inode that is not a hole. + * Search for @map_len blocks at most. The extent is returned in @result. + * + * The function returns 1 if we found an extent. The function returns 0 in + * case there is no extent at or after @lblk and in that case also sets + * @result->es_len to 0. In case of error, the error code is returned. + */ +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, struct extent_status *result) +{ + struct ext4_map_blocks map; + struct extent_status es = {}; + int ret; + + map.m_lblk = lblk; + map.m_len = map_len; + + /* + * For non-extent based files this loop may iterate several times since + * we do not determine full hole size. + */ + while (map.m_len > 0) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + /* There's extent covering m_lblk? Just return it. */ + if (ret > 0) { + int status; + + ext4_es_store_pblock(result, map.m_pblk); + result->es_lblk = map.m_lblk; + result->es_len = map.m_len; + if (map.m_flags & EXT4_MAP_UNWRITTEN) + status = EXTENT_STATUS_UNWRITTEN; + else + status = EXTENT_STATUS_WRITTEN; + ext4_es_store_status(result, status); + return 1; + } + ext4_es_find_delayed_extent_range(inode, map.m_lblk, + map.m_lblk + map.m_len - 1, + &es); + /* Is delalloc data before next block in extent tree? */ + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { + ext4_lblk_t offset = 0; + + if (es.es_lblk < lblk) + offset = lblk - es.es_lblk; + result->es_lblk = es.es_lblk + offset; + ext4_es_store_pblock(result, + ext4_es_pblock(&es) + offset); + result->es_len = es.es_len - offset; + ext4_es_store_status(result, ext4_es_status(&es)); + + return 1; + } + /* There's a hole at m_lblk, advance us after it */ + map.m_lblk += map.m_len; + map_len -= map.m_len; + map.m_len = map_len; + cond_resched(); + } + result->es_len = 0; + return 0; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f6c36922c24..eae5917c534e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -208,7 +208,7 @@ static int ext4_ioctl_setflags(struct inode *inode, { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; - int err = EPERM, migrate = 0; + int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; unsigned int jflag; @@ -583,6 +583,11 @@ group_extend_out: "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 61eaf74dca37..50e05df28f66 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -11,7 +11,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public Licens + * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ @@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) * for this page; do not hold this lock when calling this routine! */ -static int ext4_mb_init_cache(struct page *page, char *incore) +static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; @@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; - bh = kzalloc(i, GFP_NOFS); + bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; @@ -983,7 +983,7 @@ out: * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, - ext4_group_t group, struct ext4_buddy *e4b) + ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; @@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block++; pnum = block / blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); @@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) * calling this routine! */ static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; @@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group @@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) } /* init buddy cache */ page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; if (!PageUptodate(page)) { @@ -1109,8 +1109,8 @@ err: * calling this routine! */ static noinline_for_stack int -ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; @@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * we need full data about the group * to make a good selection */ - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } @@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * wait for it to initialize. */ page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; @@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + gfp); if (ret) { unlock_page(page); goto err; @@ -1247,6 +1248,12 @@ err: return ret; } +static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); +} + static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) @@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group); + int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } @@ -2285,7 +2292,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " - " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); @@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free @@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; + int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); - bh = sb_find_get_block(inode->i_sb, block + i); - if (!bh) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, bh, block + i); + if (is_metadata) + bh = sb_find_get_block(inode->i_sb, block + i); + ext4_forget(handle, is_metadata, inode, bh, block + i); } } @@ -4815,16 +4811,23 @@ do_more: #endif trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - err = ext4_mb_load_buddy(sb, block_group, &e4b); + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; - if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + /* + * We need to make sure we don't reuse the freed block until after the + * transaction is committed. We make an exception if the inode is to be + * written in writeback mode since writeback mode has weak data + * consistency guarantees. + */ + if (ext4_handle_valid(handle) && + ((flags & EXT4_FREE_BLOCKS_METADATA) || + !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* - * blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed - * * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ @@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); + ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d634e183b4d4..3ef1df6ae9ec 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,18 +23,6 @@ #include "ext4.h" /* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* */ #ifdef CONFIG_EXT4_DEBUG extern ushort ext4_mballoc_debug; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a4651894cc33..364ea4d4a943 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * blocks. * * While converting to extents we need not - * update the orignal inode i_blocks for extent blocks + * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0a512aa81bf7..24445275d330 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; ret = -EIO; goto warn_exit; } - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { ret = -EFSCORRUPTED; - else if (!ext4_mmp_csum_verify(sb, mmp)) + goto warn_exit; + } + if (!ext4_mmp_csum_verify(sb, mmp)) { ret = -EFSBADCRC; - else - return 0; - + goto warn_exit; + } + return 0; warn_exit: + brelse(*bh); + *bh = NULL; ext4_warning(sb, "Error %d while reading MMP block %llu", ret, mmp_block); return ret; @@ -181,15 +182,13 @@ static int kmmpd(void *data) EXT4_FEATURE_INCOMPAT_MMP)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } if (sb->s_flags & MS_RDONLY) { ext4_warning(sb, "kmmpd being stopped since filesystem " "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } diff = jiffies - last_update_time; @@ -211,9 +210,7 @@ static int kmmpd(void *data) if (retval) { ext4_error(sb, "error reading MMP data: %d", retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; + goto exit_thread; } mmp_check = (struct mmp_struct *)(bh_check->b_data); @@ -225,7 +222,9 @@ static int kmmpd(void *data) "The filesystem seems to have been" " multiply mounted."); ext4_error(sb, "abort"); - goto failed; + put_bh(bh_check); + retval = -EBUSY; + goto exit_thread; } put_bh(bh_check); } @@ -248,7 +247,8 @@ static int kmmpd(void *data) retval = write_mmp_block(sb, bh); -failed: +exit_thread: + EXT4_SB(sb)->s_mmp_tsk = NULL; kfree(data); brelse(bh); return retval; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index fb6f11709ae6..4098acc701c3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; - int err2, jblocks, retries = 0; + int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; + struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because @@ -380,8 +381,17 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - *err = __block_write_begin(pagep[0], from, replaced_size, - ext4_get_block); + if (!page_has_buffers(pagep[0])) + create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); + bh = page_buffers(pagep[0]); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); + if (*err < 0) + break; + bh = bh->b_this_page; + } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 06574dd77614..48e4b8907826 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; + if (ext4_encrypted_inode(dir)) { + int res = ext4_get_encryption_info(dir); + + /* + * This should be a properly defined flag for + * dentry->d_flags when we uplift this to the VFS. + * d_fsdata is set to (void *) 1 if if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + dentry->d_fsdata = NULL; + if (ext4_encryption_info(dir)) + dentry->d_fsdata = (void *) 1; + d_set_d_op(dentry, &ext4_encrypted_d_ops); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } + if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !ext4_is_child_context_consistent_with_parent(dir, inode)) { + int nokey = ext4_encrypted_inode(inode) && + !ext4_encryption_info(inode); + iput(inode); + if (nokey) + return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu\n", (unsigned long) dir->i_ino, diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b3498638e..349d7aa04fe7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); WARN_ON(io_end->handle); - if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io_end->inode)); - for (bio = io_end->bio; bio; bio = next_bio) { next_bio = bio->bi_private; ext4_finish_bio(bio); @@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) { ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); if (io) { - atomic_inc(&EXT4_I(inode)->i_ioend_count); io->inode = inode; INIT_LIST_HEAD(&io->list); atomic_set(&io->count, 1); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ad62d7acc315..34038e3598d5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data)) goto out2; flex_gd->count = flexbg_size; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ed01ec011d7..99996e9a8f57 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -55,7 +55,6 @@ static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; -static int ext4_mballoc_ready; static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb) ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); - ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { ext4_clear_feature_journal_needs_recovery(sb); @@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -1425,9 +1422,9 @@ static const struct mount_opts { {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_SET}, + MOPT_NO_EXT2}, {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, - MOPT_NO_EXT2 | MOPT_CLEAR}, + MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, @@ -1705,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; #endif + } else if (token == Opt_data_err_abort) { + sbi->s_mount_opt |= m->mount_opt; + } else if (token == Opt_data_err_ignore) { + sbi->s_mount_opt &= ~m->mount_opt; } else { if (!args->from) arg = 1; @@ -1914,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + if (test_opt(sb, DATA_ERR_ABORT)) + SEQ_OPTS_PUTS("data_err=abort"); ext4_show_quota_options(seq, sb); return 0; @@ -3796,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; no_journal: - if (ext4_mballoc_ready) { - sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); - if (!sbi->s_mb_cache) { - ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); - goto failed_mount_wq; - } + sbi->s_mb_cache = ext4_xattr_create_cache(); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; } if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && @@ -4027,6 +4028,10 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -5321,7 +5326,6 @@ MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { @@ -5334,10 +5338,8 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { - mutex_init(&ext4__aio_mutex[i]); + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); - } err = ext4_init_es(); if (err) @@ -5358,8 +5360,6 @@ static int __init ext4_init_fs(void) err = ext4_init_mballoc(); if (err) goto out2; - else - ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; @@ -5375,7 +5375,6 @@ out: unregister_as_ext3(); destroy_inodecache(); out1: - ext4_mballoc_ready = 0; ext4_exit_mballoc(); out2: ext4_exit_sysfs(); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a95151e875bd..0441e055c8e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -545,30 +545,44 @@ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - struct mb_cache_entry *ce = NULL; - int error = 0; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); + u32 hash, ref; + int error = 0; - ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; lock_buffer(bh); - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ + mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); get_bh(bh); unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); + ref--; + BHDR(bh)->h_refcount = cpu_to_le32(ref); + if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { + struct mb_cache_entry *ce; + + ce = mb_cache_entry_get(ext4_mb_cache, hash, + bh->b_blocknr); + if (ce) { + ce->e_reusable = 1; + mb_cache_entry_put(ext4_mb_cache, ce); + } + } + /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect @@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) @@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } + __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); + + /* + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect modified + * block + */ + mb_cache_entry_delete_block(ext4_mb_cache, hash, + bs->bh->b_blocknr); ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s); if (!error) { @@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } ea_bdebug(bs->bh, "cloning"); s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; @@ -872,6 +885,8 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { + u32 ref; + /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -884,9 +899,40 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + /* + * We have to be careful about races with + * freeing, rehashing or adding references to + * xattr block. Once we hold buffer lock xattr + * block's state is stable so we can check + * whether the block got freed / rehashed or + * not. Since we unhash mbcache entry under + * buffer lock when freeing / rehashing xattr + * block, checking whether entry is still + * hashed is reliable. Same rules hold for + * e_reusable handling. + */ + if (hlist_bl_unhashed(&ce->e_hash_list) || + !ce->e_reusable) { + /* + * Undo everything and check mbcache + * again. + */ + unlock_buffer(new_bh); + dquot_free_block(inode, + EXT4_C2B(EXT4_SB(sb), + 1)); + brelse(new_bh); + mb_cache_entry_put(ext4_mb_cache, ce); + ce = NULL; + new_bh = NULL; + goto inserted; + } + ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); + if (ref >= EXT4_XATTR_REFCOUNT_MAX) + ce->e_reusable = 0; ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); + ref); unlock_buffer(new_bh); error = ext4_handle_dirty_xattr_block(handle, inode, @@ -894,7 +940,8 @@ inserted: if (error) goto cleanup_dquot; } - mb_cache_entry_release(ce); + mb_cache_entry_touch(ext4_mb_cache, ce); + mb_cache_entry_put(ext4_mb_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ @@ -959,7 +1006,7 @@ getblk_failed: cleanup: if (ce) - mb_cache_entry_release(ce); + mb_cache_entry_put(ext4_mb_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); @@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, return 0; } +static int ext4_xattr_value_same(struct ext4_xattr_search *s, + struct ext4_xattr_info *i) +{ + void *value; + + if (le32_to_cpu(s->here->e_value_size) != i->value_len) + return 0; + value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); + return !memcmp(value, i->value, i->value_len); +} + /* * ext4_xattr_set_handle() * @@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { + error = 0; + /* Xattr value did not change? Save us some work and bail out */ + if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) + goto cleanup; + if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) + goto cleanup; + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; @@ -1512,17 +1577,6 @@ cleanup: } /* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* * ext4_xattr_cache_insert() * * Create a new entry in the extended attribute cache, and insert @@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb) static void ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; + struct ext4_xattr_header *header = BHDR(bh); + __u32 hash = le32_to_cpu(header->h_hash); + int reusable = le32_to_cpu(header->h_refcount) < + EXT4_XATTR_REFCOUNT_MAX; int error; - ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, + bh->b_blocknr, reusable); if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { + if (error == -EBUSY) ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { + } else ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } } /* @@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, - hash); + ce = mb_cache_entry_find_first(ext4_mb_cache, hash); while (ce) { struct buffer_head *bh; - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT4_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT4_XATTR_REFCOUNT_MAX); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_next(ext4_mb_cache, ce); } return NULL; } @@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #define HASH_BUCKET_BITS 10 struct mb_cache * -ext4_xattr_create_cache(char *name) +ext4_xattr_create_cache(void) { - return mb_cache_create(name, HASH_BUCKET_BITS); + return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957760ba..69dd3e6566e0 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); @@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); -extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY |